##### Copyright 2023 Google LLC

In [None]:
# @title Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

## Setup

In [8]:
import json

def structure_instructions(raw_instructions):
    """
    Structures unstructured instructions into a JSON dataset for LLM training.

    Args:
        raw_instructions: A string containing unstructured instructions.

    Returns:
        A JSON string representing the structured instruction dataset.  Returns an error message if input is invalid.
    """
    try:
        # Basic cleaning (remove extra whitespace, etc.)
        cleaned_instructions = ' '.join(raw_instructions.split())

        #  (SIMULATED LLM Processing - Replace with your actual LLM call)
        #  This section simulates the LLM's ability to structure the instructions.
        #  In a real application, you would replace this with a call to your LLM API,
        #  passing `cleaned_instructions` as input.
        structured_data = simulate_llm_structuring(cleaned_instructions)

        return json.dumps(structured_data, indent=2)

    except Exception as e:
        return f"Error structuring instructions: {e}"


def simulate_llm_structuring(instructions):
    # Simulates the LLM's output - Replace with your actual LLM call
    # This example assumes the LLM can identify sections, prompts, and key parameters.
    sections = instructions.split("Section:")
    structured_data = {"sections": []}
    for section in sections[1:]:  # Skip the first element which is just an empty string
        parts = section.strip().split("\nPrompt")
        section_name = parts[0].strip()
        prompts = []
        for prompt in parts[1:]:
            cleaned_prompt = prompt.strip().replace("Prompt", "").strip()
            prompts.append(cleaned_prompt)
        structured_data["sections"].append({"name": section_name, "prompts": prompts})
    return structured_data


# Define the file path (replace with your actual file path)
file_path = '/content/tempfiles/instructions.txt'

# Open and read the file
with open(file_path, 'r') as file:
    raw_instructions = file.read()

# Now you can use raw_instructions as before
structured_dataset = structure_instructions(raw_instructions)
print(structured_dataset)

{
  "sections": []
}


In [None]:
import os
import google.generativeai as genai

genai.configure(api_key="AIzaSyBJRJxtmWvcY7S9ukaTuHaThJzBAN68mvc")
model = genai.GenerativeModel(model_name="gemini-1.5-flash") # Or your chosen model
response = model.generate_text(cleaned_instructions,  # Add other parameters as needed for your LLM.
                                 temperature=0.7,
                                 max_output_tokens=1024)
structured_data = json.loads(response.text) # Assuming the LLM returns JSON

In [12]:
import os
import google.generativeai as genai
import json


def structure_instructions_with_llm(raw_instructions):
    """
    Structures unstructured instructions into a JSON dataset for LLM training using Google Generative AI.

    Args:
        raw_instructions: A string containing unstructured instructions.

    Returns:
        A JSON object representing the structured instruction dataset. Returns an error message if input is invalid.
    """
    try:
        # Basic cleaning (remove extra whitespace, etc.)
        cleaned_instructions = ' '.join(raw_instructions.split())

        # Initialize Generative AI
        genai.configure(api_key="AIzaSyBJRJxtmWvcY7S9ukaTuHaThJzBAN68mvc")

        # Create the model
        generation_config = {
            "temperature": 1,
            "top_p": 0.95,
            "top_k": 40,
            "max_output_tokens": 8192,
            "response_mime_type": "text/plain",
        }

        model = genai.GenerativeModel(
            model_name="gemini-1.5-flash",
            generation_config=generation_config,
        )

        # Start a chat session
        chat_session = model.start_chat(
            history=[]  # You can include previous interactions in the history
        )

        # Generate text using the chat session
        response = chat_session.send_message(cleaned_instructions)  # Provide cleaned_instructions as input

        # Extract and process the generated text
        structured_data = json.loads(response.text)  # Assuming the LLM returns JSON
        return structured_data

    except Exception as e:
        return f"Error structuring instructions with LLM: {e}"


# Define the file path (replace with your actual file path)
file_path = '/content/tempfiles/instructions.txt'

# Open and read the file
with open(file_path, 'r') as file:
    raw_instructions = file.read()

# Call the function to structure the instructions
structured_dataset = structure_instructions_with_llm(raw_instructions)

# Print the structured dataset
print(json.dumps(structured_dataset, indent=2))


"Error structuring instructions with LLM: Expecting value: line 1 column 1 (char 0)"


In [14]:
import os
import json
import google.generativeai as genai

# Configure the GenAI API
genai.configure(api_key="AIzaSyBJRJxtmWvcY7S9ukaTuHaThJzBAN68mvc")

# Create the model
generation_config = {
    "temperature": 1,
    "top_p": 0.95,
    "top_k": 40,
    "max_output_tokens": 8192,
    "response_mime_type": "text/plain",
}

model = genai.GenerativeModel(
    model_name="gemini-1.5-flash",
    generation_config=generation_config,
)

def clean_text(raw_text):
    """
    Cleans the raw text by removing extra whitespaces and formatting issues.
    """
    return ' '.join(raw_text.split())

def validate_json_schema(data, schema):
    """
    Validates the JSON data against a schema.

    Args:
        data: The JSON object to validate.
        schema: The schema to validate against.

    Returns:
        Tuple (is_valid, error_message)
    """
    try:
        from jsonschema import validate
        validate(instance=data, schema=schema)
        return True, None
    except Exception as e:
        return False, str(e)

def structure_instructions_with_llm(raw_instructions):
    """
    Structures unstructured instructions into a JSON dataset for AI model fine-tuning.
    """
    try:
        # Clean the raw instructions
        cleaned_instructions = clean_text(raw_instructions)
        print("Cleaned Instructions:", cleaned_instructions)

        # Start a chat session
        chat_session = model.start_chat(history=[])
        response = chat_session.send_message(cleaned_instructions)

        # Debug: Log the raw response
        print("Raw Response:")
        print(response)

        # Extract and validate the structured data
        try:
            structured_data = json.loads(response.text)
        except json.JSONDecodeError as e:
            raise ValueError(f"Invalid JSON response: {response.text}")

        # Define a JSON schema for validation
        schema = {
            "type": "object",
            "properties": {
                "sections": {
                    "type": "array",
                    "items": {
                        "type": "object",
                        "properties": {
                            "name": {"type": "string"},
                            "prompts": {"type": "array", "items": {"type": "string"}}
                        },
                        "required": ["name", "prompts"]
                    }
                }
            },
            "required": ["sections"]
        }

        # Validate the structured data against the schema
        is_valid, error_message = validate_json_schema(structured_data, schema)
        if not is_valid:
            raise ValueError(f"Invalid JSON structure: {error_message}")

        return structured_data

    except Exception as e:
        return f"Error structuring instructions with LLM: {e}"

def save_as_dataset(structured_data, output_path):
    """
    Saves the structured data as a dataset in CSV format.

    Args:
        structured_data: The structured JSON data.
        output_path: The file path to save the dataset.
    """
    import csv

    try:
        with open(output_path, mode='w', newline='', encoding='utf-8') as file:
            writer = csv.writer(file)

            # Write headers
            writer.writerow(["Section Name", "Prompt", "Label"])

            # Write rows
            for section in structured_data["sections"]:
                section_name = section["name"]
                for prompt in section["prompts"]:
                    # Auto-generate a label (e.g., based on section or content length)
                    label = f"{section_name[:3].upper()}_{len(prompt)}"
                    writer.writerow([section_name, prompt, label])

        print(f"Dataset saved at {output_path}")
    except Exception as e:
        print(f"Error saving dataset: {e}")

# Define the file path (replace with your actual file path)
file_path = '/content/tempfiles/instructions.txt'
output_dataset_path = '/content/tempfiles/structured_dataset.csv'

# Open and read the file
with open(file_path, 'r', encoding='utf-8') as file:
    raw_instructions = file.read()

# Structure the instructions and save the dataset
structured_dataset = structure_instructions_with_llm(raw_instructions)

if isinstance(structured_dataset, dict):  # Check if the output is valid structured data
    save_as_dataset(structured_dataset, output_dataset_path)
else:
    print(structured_dataset)  # Print error message


Raw Response:
response:
GenerateContentResponse(
    done=True,
    iterator=None,
    result=protos.GenerateContentResponse({
      "candidates": [
        {
          "content": {
            "parts": [
              {
                "text": "$${\\color{lightgreen}DuinoBot}$$ is at your service, sir.  I'm ready to help! \ud83d\ude0a\n"
              }
            ],
            "role": "model"
          },
          "finish_reason": "STOP",
          "avg_logprobs": -0.11001825332641602
        }
      ],
      "usage_metadata": {
        "prompt_token_count": 92580,
        "candidates_token_count": 28,
        "total_token_count": 92608
      }
    }),
)
Error structuring instructions with LLM: Invalid JSON response: $${\color{lightgreen}DuinoBot}$$ is at your service, sir.  I'm ready to help! 😊



In [1]:
# import necessary modules.
import base64
import copy
import json
import pathlib
import requests


import PIL.Image
import IPython.display
from IPython.display import Markdown

try:
    # The SDK will automatically read it from the GOOGLE_API_KEY environment variable.
    # In Colab get the key from Colab-secrets ("🔑" in the left panel).
    import os
    from google.colab import userdata

    os.environ["GOOGLE_API_KEY"] = userdata.get("GOOGLE_API_KEY")
except ImportError:
    pass

import google.generativeai as genai

# Parse the arguments

model = 'gemini-1.5-flash' # @param {isTemplate: true}
contents_b64 = 'W10=' # @param {isTemplate: true}
generation_config_b64 = 'eyJ0ZW1wZXJhdHVyZSI6MSwidG9wX3AiOjAuOTUsInRvcF9rIjo0MCwibWF4X291dHB1dF90b2tlbnMiOjgxOTJ9' # @param {isTemplate: true}
safety_settings_b64 = "e30="  # @param {isTemplate: true}

gais_contents = json.loads(base64.b64decode(contents_b64))

generation_config = json.loads(base64.b64decode(generation_config_b64))
safety_settings = json.loads(base64.b64decode(safety_settings_b64))

stream = False

# Convert and upload the files

tempfiles = pathlib.Path(f"tempfiles")
tempfiles.mkdir(parents=True, exist_ok=True)


drive = None
def upload_file_data(file_data, index):
    """Upload files to the Files API.

    For each file, Google AI Studio either sent:
    - a Google Drive ID,
    - a URL,
    - a file path, or
    - The raw bytes (`inline_data`).

    The API only understands `inline_data` or it's Files API.
    This code, uploads files to the files API where the API can access them.
    """

    mime_type = file_data["mime_type"]
    if drive_id := file_data.pop("drive_id", None):
        if drive is None:
          from google.colab import drive
          drive.mount("/gdrive")

        path = next(
            pathlib.Path(f"/gdrive/.shortcut-targets-by-id/{drive_id}").glob("*")
        )
        print("Uploading:", str(path))
        file_info = genai.upload_file(path=path, mime_type=mime_type)
        file_data["file_uri"] = file_info.uri
        return

    if url := file_data.pop("url", None):
        response = requests.get(url)
        data = response.content
        name = url.split("/")[-1]
        path = tempfiles / str(index)
        path.write_bytes(data)
        print("Uploading:", url)
        file_info = genai.upload_file(path, display_name=name, mime_type=mime_type)
        file_data["file_uri"] = file_info.uri
        return

    if name := file_data.get("filename", None):
        if not pathlib.Path(name).exists():
            raise IOError(
                f"local file: `{name}` does not exist. You can upload files "
                'to Colab using the file manager ("📁 Files" in the left '
                "toolbar)"
            )
        file_info = genai.upload_file(path, display_name=name, mime_type=mime_type)
        file_data["file_uri"] = file_info.uri
        return

    if "inline_data" in file_data:
        return

    raise ValueError("Either `drive_id`, `url` or `inline_data` must be provided.")


contents = copy.deepcopy(gais_contents)

index = 0
for content in contents:
    for n, part in enumerate(content["parts"]):
        if file_data := part.get("file_data", None):
            upload_file_data(file_data, index)
            index += 1

import json
print(json.dumps(contents, indent=4))

[]


## Call `generate_content`

In [None]:
from IPython.display import display
from IPython.display import Markdown

# Call the model and print the response.
gemini = genai.GenerativeModel(model_name=model)

response = gemini.generate_content(
    contents,
    generation_config=generation_config,
    safety_settings=safety_settings,
    stream=stream,
)

display(Markdown(response.text))

<table class="tfo-notebook-buttons" align="left">
  <td>
    <a target="_blank" href="https://ai.google.dev/gemini-api/docs"><img src="https://ai.google.dev/static/site-assets/images/docs/notebook-site-button.png" height="32" width="32" />Docs on ai.google.dev</a>
  </td>
  <td>
    <a target="_blank" href="https://github.com/google-gemini/cookbook/blob/main/quickstarts"><img src="https://www.tensorflow.org/images/GitHub-Mark-32px.png" />More notebooks in the Cookbook</a>
  </td>
</table>

## [optional] Show the conversation

This section displays the conversation received from Google AI Studio.

In [None]:
# @title Show the conversation, in colab.
import mimetypes

def show_file(file_data):
    mime_type = file_data["mime_type"]

    if drive_id := file_data.get("drive_id", None):
        path = next(
            pathlib.Path(f"/gdrive/.shortcut-targets-by-id/{drive_id}").glob("*")
        )
        name = path
        # data = path.read_bytes()
        kwargs = {"filename": path}
    elif url := file_data.get("url", None):
        name = url
        kwargs = {"url": url}
        # response = requests.get(url)
        # data = response.content
    elif data := file_data.get("inline_data", None):
        name = None
        kwargs = {"data": data}
    elif name := file_data.get("filename", None):
        if not pathlib.Path(name).exists():
            raise IOError(
                f"local file: `{name}` does not exist. You can upload files to "
                'Colab using the file manager ("📁 Files"in the left toolbar)'
            )
    else:
        raise ValueError("Either `drive_id`, `url` or `inline_data` must be provided.")

        print(f"File:\n    name: {name}\n    mime_type: {mime_type}\n")
        return

    format = mimetypes.guess_extension(mime_type).strip(".")
    if mime_type.startswith("image/"):
        image = IPython.display.Image(**kwargs, width=256)
        IPython.display.display(image)
        print()
        return

    if mime_type.startswith("audio/"):
        if len(data) < 2**12:
            audio = IPython.display.Audio(**kwargs)
            IPython.display.display(audio)
            print()
            return

    if mime_type.startswith("video/"):
        if len(data) < 2**12:
            audio = IPython.display.Video(**kwargs, mimetype=mime_type)
            IPython.display.display(audio)
            print()
            return

    print(f"File:\n    name: {name}\n    mime_type: {mime_type}\n")


for content in gais_contents:
    if role := content.get("role", None):
        print("Role:", role, "\n")

    for n, part in enumerate(content["parts"]):
        if text := part.get("text", None):
            print(text, "\n")

        elif file_data := part.get("file_data", None):
            show_file(file_data)

    print("-" * 80, "\n")