In [1]:
%load_ext jupyter_black

In [None]:
# For google colab:
# !apt-get purge python3-pyaudio
# !apt-get install portaudio19-dev
# !pip install --force-reinstall pyaudio
# !pip install --force-reinstall marvin[audio]

# if you are using Mac or Linux:
# You will need to install the audio dependencies first:
# https://www.askmarvin.ai/welcome/installation/#audio-features
# and then:
# pip install marvin[audio]


In [2]:
import os
import pathlib
import getpass
from openai import OpenAI
import marvin
from marvin.beta.assistants import (
    Assistant,
    CodeInterpreter,
    FileSearch,
    Thread,
    PrintHandler,
)
from marvin.beta.assistants.formatting import pprint_messages, create_panel
import ipywidgets as widgets
from IPython.display import Image, display

In [3]:
# Ask for the API key interactively instead of hardcoding it in the notebook.
_API_KEY = getpass.getpass(prompt="Your API key: \n")

Your API key: 
 ········


In [4]:
# OpenAI client used further down for the direct audio transcription and
# translation calls.
client = OpenAI(api_key=_API_KEY)

In [5]:
# Hand the same key to marvin's OpenAI settings.
marvin.settings.openai.api_key = _API_KEY

In [94]:
# Generate an image from a short text prompt; the response is kept in `image`.
image = marvin.paint("Beaver Stadium")

In [54]:
def display_image(url, width=None, height=None):
    """Show the image found at ``url`` in the notebook output.

    Args:
        url: Address of the image to show.
        width: Optional display width passed to ``Image``.
        height: Optional display height passed to ``Image``.

    Returns:
        Whatever ``display`` returns for the image.
    """
    remote_image = Image(url=url, width=width, height=height)
    return display(remote_image)

In [48]:
def display_revised_prompt(
    revised_prompt, created, title="Revised Prompt", color="green"
):
    """Show the revised image-generation prompt inside a panel.

    Args:
        revised_prompt: Prompt text to show as the panel content.
        created: Creation value forwarded to ``create_panel`` (callers in this
            notebook pass ``image.created``).
        title: Panel title.
        color: Panel color forwarded to ``create_panel``.
    """
    display(create_panel(revised_prompt, title, created, color))

In [49]:
def create_image(
    prompt, print_revised_prompt=True, width=None, height=None, model_kwargs=None
):
    """Generate an image with ``marvin.paint`` and display it in the notebook.

    Args:
        prompt: Text instructions passed to ``marvin.paint``.
        print_revised_prompt: When true, show the revised prompt above the
            image, provided the response actually carries one.
        width: Optional display width forwarded to ``display_image``.
        height: Optional display height forwarded to ``display_image``.
        model_kwargs: Optional model options forwarded to ``marvin.paint``
            (for example quality, model, or size).

    Returns:
        The response object returned by ``marvin.paint``.
    """
    image = marvin.paint(instructions=prompt, model_kwargs=model_kwargs)
    data = image.data[0]

    # The revised prompt is optional in the response (the OpenAI docs describe
    # it as DALL-E 3 only), so skip the panel when there is nothing to show.
    if print_revised_prompt and data.revised_prompt:
        display_revised_prompt(data.revised_prompt, image.created)

    display_image(data.url, width=width, height=height)
    return image

In [93]:
# Generate and display an image, using the default display settings.
image = create_image(prompt="Penn State mascot (Nittany Lion) flying")

# Styles and templates

We can use prompt templates here too

In [75]:
# Prompt template with {season} and {style} placeholders.
template = """Penn State mascot (Nittany Lion) looks on after yet another victory.
            The sky has colorful clouds.
            There is nothing in the horizon.

            It is {season} and the image is in the style of {style}."""

# Fill in the placeholders first, then pass the finished prompt along.
summer_impressionism_prompt = template.format(season="Summer", style="impressionism")
image = create_image(summer_impressionism_prompt)

# ToDo

Create an image of your favorite place during Winter in the style of [Francisco Goya](https://en.wikipedia.org/wiki/Francisco_Goya).

# Options
You can change how images are created. See [OpenAI API](https://platform.openai.com/docs/guides/images/usage) for more details. 
For example, you can change:
* model: `dall-e-2`, `dall-e-3`
* quality: "standard", "hd"
* generated image size: "256x256", "512x512", "1024x1024", "1792x1024", or "1024x1792"
    +  "Must be one of `256x256`, `512x512`, or `1024x1024` for `dall-e-2`. Must be one
    of `1024x1024`, `1792x1024`, or `1024x1792` for `dall-e-3` models."

In [111]:
prompt = """Beaver Stadium with a panoramic view of the city below and birds flying around
It is {season} and the image is in the style of {style}."""

# Model options are passed through to the image model via create_image.
hd_landscape_options = {"quality": "hd", "model": "dall-e-3", "size": "1792x1024"}
summer_pixel_art_prompt = prompt.format(season="Summer", style="pixel art")
image = create_image(summer_pixel_art_prompt, model_kwargs=hd_landscape_options)

# ToDo

* Create an image with "1024x1792" dimension
* Create an image with "256x256" dimension

# What is happening with `revised prompt`?

The DALL·E 3 API automatically [rewrites the prompt](https://platform.openai.com/docs/guides/images/usage#dall-e-3-prompting). Why?

* "For safety reasons"
* "To add more detail (more detailed prompts generally result in higher quality images)"

You *can't* disable this behavior. However, you can limit prompt rewriting by using the following instruction:

`I NEED to test how the tool works with extremely simple prompts. DO NOT add any detail, just use it AS-IS:`


In [101]:
# The first line of the prompt asks the model not to rewrite it.
minimize_prompt_revision = """I NEED to test how the tool works with extremely simple prompts. DO NOT add any detail, just use it AS-IS:
Beaver Stadium with a panoramic view of the city below and birds flying around
It is {season} and the image is in the style of {style}."""

autumn_oil_prompt = minimize_prompt_revision.format(
    season="Autumn", style="expressive oil painting"
)
image = create_image(autumn_oil_prompt)

## Does it work better when you have a simpler prompt?

In [112]:
# Same anti-rewrite instruction, but with the season/style sentence dropped.
# The text has no {placeholders}, so it is passed to create_image unchanged.
minimize_prompt_revision = """I NEED to test how the tool works with extremely simple prompts. DO NOT add any detail, just use it AS-IS:
Beaver Stadium with a panoramic view of the city below and birds flying around"""
image = create_image(minimize_prompt_revision)

# Generating Captions from images

In [124]:
# Image hosted on Wikimedia Commons; shown first so the caption generated in
# the next cell can be checked against it.
url = "https://upload.wikimedia.org/wikipedia/commons/thumb/8/88/Kruger_National_Park_%28ZA%29%2C_Elefant_--_2024_--_0649.jpg/800px-Kruger_National_Park_%28ZA%29%2C_Elefant_--_2024_--_0649.jpg"
display_image(url)

In [125]:
# Wrap the URL in a marvin Image, then ask for a caption. The caption is the
# last expression, so it is shown as the cell output.
image = marvin.Image.from_url(url)
marvin.caption(image)

'A young elephant playfully sprays water with its trunk in a lush, green landscape, surrounded by trees and grass.'

# ToDo

Generate caption for one of the images from [here](https://www.psu.edu/about/nittany-lion-shrine).

# You can use local images as well

You can use `marvin.Image.from_path`. For example, let's say you have `test.png` in your `image` folder. You can use
the following code:

```python
image = marvin.Image.from_path("image/test.png")
marvin.caption(image)
```

# ToDo: Generate caption for a local image

* Download one of the images from [here](https://www.psu.edu/about/nittany-lion-shrine).
* Generate a caption
* Is the caption correct?

# Extract entities within image

We will use the `marvin.extract` function. We can specify what to extract with the `instructions` parameter.

In [128]:
# Image source:
# https://commons.wikimedia.org/wiki/File:Crested_hawk-eagle_(Nisaetus_cirrhatus_cirrhatus)_with_Indian_garden_lizard.jpg
# Note: this rebinds `url`, replacing the image URL set for the caption example.
url = "https://upload.wikimedia.org/wikipedia/commons/thumb/8/83/Crested_hawk-eagle_%28Nisaetus_cirrhatus_cirrhatus%29_with_Indian_garden_lizard.jpg/764px-Crested_hawk-eagle_%28Nisaetus_cirrhatus_cirrhatus%29_with_Indian_garden_lizard.jpg"
display_image(url)

In [131]:
# `instructions` describes what should be pulled out of the image; the
# extracted values are shown as the cell output.
image = marvin.Image.from_url(url)
marvin.extract(image, instructions="all species")

['Crested Hawk-Eagle', 'Lizard']

# ToDo

Extract the number of people from the [Penn State's Picture-Perfect Landmark](https://www.psu.edu/about/nittany-lion-shrine) image.


# ToDo

Come up with a use case where extraction *doesn't* work for one of these [images](https://www.psu.edu/about/nittany-lion-shrine).

# Generating Speech

Using `marvin.speak`

In [6]:
# Convert the text to speech; no voice is specified here.
text = "This is an IST course focusing on Human-centered AI (HCAI) topics"
audio = marvin.speak(text)

In [10]:
# Uncomment to play the audio directly:
# audio.play()

# If that doesn't work, save the audio as an .mp3 file and play that instead.

audio.save("marvin.mp3")

# We can select different voices

https://platform.openai.com/docs/guides/text-to-speech/voice-options#voice-options

In [11]:
# Same sentence as before, this time with an explicit voice.
text = "This is an IST course focusing on Human-centered AI (HCAI) topics"
audio = marvin.speak(text, voice="nova")

# audio.play()
# NOTE: same "marvin.mp3" path as the previous save, so that file is
# presumably overwritten.
audio.save("marvin.mp3")

# ToDo: Generate speech using `Onyx` voice


https://platform.openai.com/docs/guides/text-to-speech/voice-options#voice-options

# You can generate speech in different languages

Supported languages: https://platform.openai.com/docs/guides/text-to-speech#supported-languages

In [17]:
# However, it seems to work reasonably well for some _unsupported_ languages too.
# The sample below is Bengali text.
text = "কমলাপুর রেলওয়ে স্টেশন (দাপ্তরিক নাম ঢাকা রেলওয়ে স্টেশন) হলো বাংলাদেশের রাজধানী ঢাকায় অবস্থিত কেন্দ্রীয় রেলওয়ে স্টেশন। এটি দেশের বৃহত্তম স্টেশন ও পরিবহন খাতে ব্যস্ততম অবকাঠামো, যা রাজধানীর প্রবেশদ্বার হিসেবে বিবেচিত।"
audio = marvin.speak(text, voice="nova")

# audio.play()
# NOTE: saved to the same "marvin.mp3" path used by the earlier cells.
audio.save("marvin.mp3")

# Generating transcription

You can generate transcriptions from audio data too ([speech to text](https://platform.openai.com/docs/guides/speech-to-text))

* Download the audio files from [class repo](https://github.com/hcai-class/Spring-2025/tree/main/activity/data/)
* You will have to create a new folder `data-activity-05` within `data`
* Upload the audio files to `data-activity-05`

In [19]:
from marvin.audio import Audio

# or, if that doesn't work:
# from marvin.types import Audio

In [27]:
# Load the clip from the data folder described above, then transcribe it.
audio = Audio.from_path("data/data-activity-05/Oppenheimer.mp3")
transcription = marvin.transcribe(audio)

In [26]:
# Show the transcription text produced by the previous cell.
transcription

'We knew the world would not be the same. Few people laughed. Few people cried. Most people were silent. I remembered the line from the Hindu scripture, the Bhagavad Gita. Vishnu is trying to persuade the prince that he should do his duty. And to impress him, takes on his multi-armed form and says, Now I am become death, the destroyer of worlds. I suppose we all thought that one way or another.'

# ToDo: Generate transcript for the MLK-mountaintop.mp3 file

In [41]:
# https://platform.openai.com/docs/guides/speech-to-text#timestamps
# You can get granular timestamps in transcriptions too!

# Open the file in a `with` block so the handle is closed once the request
# has finished (the original left it open).
with open("data/data-activity-05/Oppenheimer.mp3", "rb") as audio_file:
    transcription = client.audio.transcriptions.create(
        file=audio_file,
        model="whisper-1",
        response_format="verbose_json",
        timestamp_granularities=["word"],
    )

In [None]:
# Word-level entries from the transcription requested with
# timestamp_granularities=["word"].
transcription.words

# Translating to English

[Only to English](https://platform.openai.com/docs/guides/speech-to-text#translations)

In [42]:
# This doesn't seem to work very well (though it is a very short audio clip).
# Open the file in a `with` block so the handle is closed once the request
# has finished (the original left it open).
with open("data/data-activity-05/bn-wiki.mp3", "rb") as audio_file:
    transcription = client.audio.translations.create(
        model="whisper-1",
        file=audio_file,
    )

print(transcription.text)

SUTTER JAI SARVOTRU


# ToDo: Create a voice interface that can describe a given image

Steps:
* Select an image from [the Shrine Page](https://www.psu.edu/about/nittany-lion-shrine) (e.g., [Penn State Lehigh Valley](https://images.ctfassets.net/ni9rh5nu0d99/461KrIs4HquhN6pBzrhlUJ/cf474fb8b85a4a8e84aedd953c3adf1b/shrine18-lehigh.jpg?fm=webp&w=3840&q=75)).
* Use the OpenAI API to describe the scene in the image
* Then use text-to-speech capability to convert the description to an audio file
