In [2]:
import numpy as np
import os
import urllib.request as ur
import ipywidgets as widgets

from openai import OpenAI
from IPython.display import IFrame, HTML

<h3> Creating a client object to call the APIs.</h3>

In [3]:
# The key is taken from the OPENAI_KEY environment variable so that it never
# appears in the notebook; passing a literal key here is NOT RECOMMENDED.
# When no key is supplied, the client defaults to os.environ.get("OPENAI_API_KEY").
client = OpenAI(api_key=os.environ.get("OPENAI_KEY"))

client

<openai.OpenAI at 0x7fe08850b790>

<h3> The GPT models: </h3>
    
Here is an example that calls the chat completions API to check that the key is working.

In [4]:
# Smoke test for the key: the system message sets the persona and the user
# message carries the actual request.
response = client.chat.completions.create(
    model="gpt-3.5-turbo-1106",
    messages=[
        {"role": "system", "content": "You are a skilled Python programmer."},
        {"role": "user", "content": "Generate a function that adds 2 numbers from the prompt."},
    ],
)

# The generated text is read from the first choice of the response.
print(response.choices[0].message.content)

Certainly! Here's a Python function that prompts the user to enter two numbers, adds them together, and then returns the result:

```python
def add_two_numbers():
    num1 = float(input("Enter the first number: "))
    num2 = float(input("Enter the second number: "))
    result = num1 + num2
    return result

# Example usage
sum_result = add_two_numbers()
print("The sum of the two numbers is:", sum_result)
```

When you run this code, it will prompt you to enter two numbers, add them together, and then display the result.


<h4>Getting information about the token consumption for this request</h4>

In [5]:
print(response.model_dump()['usage'])

{'completion_tokens': 124, 'prompt_tokens': 30, 'total_tokens': 154}


You can compute the cost of each call to the `chat.completions` API with the following formula, where both costs are prices per 1,000 tokens: `((prompt_tokens * <cost_of_input_tokens>) + (completion_tokens * <cost_of_output_tokens>)) / 1000`

For instance, the cost of the previous request using `gpt-3.5-turbo-1106` is:

In [6]:
def cost_calculator_for_GPT_3_5_turbo(
    response,
    cost_of_input_tokens=0.001,
    cost_of_output_tokens=0.002,
):
    """Return a human-readable cost estimate for one chat completion call.

    Args:
        response: Object exposing ``model_dump()`` whose result contains a
            ``'usage'`` mapping with ``'prompt_tokens'`` and
            ``'completion_tokens'`` entries.
        cost_of_input_tokens: Price in USD per 1,000 prompt tokens. The
            default is valid only for the "gpt-3.5-turbo-1106" model.
        cost_of_output_tokens: Price in USD per 1,000 completion tokens. The
            default is valid only for the "gpt-3.5-turbo-1106" model.

    Returns:
        A string of the form ``"Total cost for API call: $<cost> USD"``.

    Check https://openai.com/pricing for up-to-date prices and pass them in
    explicitly when they differ from the defaults.
    """
    # Serialize the response once and reuse the usage section for both counts.
    usage = response.model_dump()['usage']
    prompt_tokens = usage['prompt_tokens']
    completion_tokens = usage['completion_tokens']

    # Prices are quoted per 1,000 tokens, hence the final division.
    total_cost = (
        (prompt_tokens * cost_of_input_tokens) + (completion_tokens * cost_of_output_tokens)
    ) / 1000

    return f"Total cost for API call: ${total_cost} USD"

cost_calculator_for_GPT_3_5_turbo(response)

'Total cost for API call: $0.00027800000000000004 USD'

<h4>Formatting the output generated from the API</h4>

You can use `response_format` argument from the `chat.completions` API to structure the data generated by the model:

In [6]:
# response_format={"type": "json_object"} asks the model to answer with a JSON document.
response = client.chat.completions.create(
    model="gpt-3.5-turbo-1106",
    response_format={"type": "json_object"},
    messages=[
        {"role": "user", "content": "Can you generate json with information from your favorite actor?"},
    ],
)

# Show what the call cost first, then the generated JSON text.
print(cost_calculator_for_GPT_3_5_turbo(response))
print(response.choices[0].message.content)

Total cost for API call: $0.000314 USD
{
  "actor": "Tom Hanks",
  "birthdate": "July 9, 1956",
  "birthplace": "Concord, California, USA",
  "nationality": "American",
  "best_known_for": "Forrest Gump, Saving Private Ryan, Cast Away, and Toy Story",
  "awards": {
    "Academy Awards": 2,
    "Golden Globe Awards": 7,
    "Emmy Awards": 1
  },
  "marital_status": "Married",
  "spouse": "Rita Wilson",
  "children": 4,
  "other_interests": ["directing", "producing", "writing"]
}


<h4>Mind the temperature!</h4>

You can use `temperature` to spice up the responses you get back from the models, but keep in mind that if you increase the `temperature` too much, things might not make much sense...

In [7]:
# Same prompt as the JSON example, but with a very high temperature to show
# how the output degrades.
response = client.chat.completions.create(
    model="gpt-3.5-turbo-1106",
    temperature=1.9,
    response_format={"type": "json_object"},
    messages=[
        {"role": "user", "content": "Can you generate json with information from your favorite actor?"},
    ],
)

print(cost_calculator_for_GPT_3_5_turbo(response))
print(response.choices[0].message.content)

Total cost for API call: $0.000124 USD
{                     
  	"firstName": "Leonardo",
	    "lastName": "DiCaprio",
	    "age": 47,
	    	  "careerBegin": 76956   	               
                         
                    
      



             
  								             
                                                                                                                                                                                
	
                                                                 
}


<h4>Multiple answers and limiting tokens for the generated output</h4>

You can use the `n` argument to set the number of answers you want to get for the input prompt, while `max_tokens` limits the length of each answer generated by the model.

In [8]:
# Ask for three alternative answers, each capped at 20 generated tokens.
response = client.chat.completions.create(
    model="gpt-3.5-turbo-1106",
    messages=[{"role": "user", "content": "Is Pluto a planet?"}],
    n=3,
    max_tokens=20,
    temperature=1.5,
)

print(response.model_dump()['usage'], '\n')
print(cost_calculator_for_GPT_3_5_turbo(response), '\n')

# Each choice carries a zero-based index; show it one-based for readability.
for choice in response.choices:
    print(f"""Option {choice.index + 1}:

    {choice.message.content}

    """)

{'completion_tokens': 60, 'prompt_tokens': 12, 'total_tokens': 72} 

Total cost for API call: $0.000132 USD 

Option 1: 
    
    According to the International Astronomical Union's definition, Pluto is considered a dwarf planet rather than a

    
Option 2: 
    
    The status of Pluto changed in 2006 when the International Astronomical Union (IAU)

    
Option 3: 
    
    According to the International Astronomical Union, Pluto is classified as a "dwarf planet." While

    


Note that for the previous example we set `n=3` and `max_tokens=20`, but the `completion_tokens` value was 60!

This is because `max_tokens` applies to each answer separately: each one of the 3 outputs contains `20 tokens`, which is why the total number of output tokens was 60. It is also worth noting that even though each answer consists of 20 tokens, the strings they represent have different lengths.



<h3>Embeddings</h3>

Here you will see how to create embeddings for any string you send as `input` to the `ada 2` model.

Note that the dimension of the embeddings is currently fixed to 1536 by the model.

In [9]:
# Request the embedding of a single sentence.
response = client.embeddings.create(
    model="text-embedding-ada-002",
    input="I am going to convert this into a vector!",
)

# The vector is read from the first entry of the returned data list.
embeddings = response.data[0].embedding
print("Dimension of the vector:", len(embeddings))

# Display only the first ten components instead of the whole vector.
embeddings[:10]

Dimension of the vector: 1536


[-0.029523631557822227,
 0.002989797852933407,
 -0.007745315786451101,
 -0.001285364618524909,
 0.010958727449178696,
 0.012396479956805706,
 0.003584444522857666,
 -0.014231769368052483,
 -0.0057973917573690414,
 -0.03909098729491234]

Following a classic example of word semantics being (more or less) preserved by the embeddings representing them, we can see how much of the relationships between the words is preserved through vector operations.

For instance, using the set of words: `Queen`, `King`, `Woman` and `Man`, and by looking at their embeddings in 2D from the sample image below, one could think that the following operation should hold: `king + woman − man ≈ approx_queen`.

In [10]:
# The URL is assembled from shorter pieces purely to keep the lines readable.
embedding_img = "".join([
    "https://www.researchgate.net/profile/Peter-Sutor/publication/",
    "332679657/figure/fig1/AS:809485488640000@1570007788866/",
    "The-classical-king-woman-man-queen-example-of-neural-word-embeddings-in-2D-It.png",
])

# Embed the remote picture in an 800x400 frame.
IFrame(src=embedding_img, width=800, height=400)

So, let us see if the embeddings generated by the `ada 2` model hold some of these relationships.

First, we will compute the embeddings for each of these words:

In [7]:
def get_word_embedding(word, model="text-embedding-ada-002"):
    """Return the embedding of ``word`` as a NumPy array.

    Args:
        word: Text to embed; sent as the ``input`` of the embeddings request.
        model: Name of the embedding model to call.

    Returns:
        ``np.ndarray`` built from the first embedding in the response.
    """
    response = client.embeddings.create(input=word, model=model)
    return np.array(response.model_dump()['data'][0]['embedding'])


# One request per word; the helper replaces four copy-pasted API calls.
man_embedding = get_word_embedding("Man")
king_embedding = get_word_embedding("King")
woman_embedding = get_word_embedding("Woman")
queen_embedding = get_word_embedding("Queen")

Once we have the embeddings for each word, we can proceed to compute the `approx_queen` embedding:

In [8]:
# Element-wise vector arithmetic for the analogy: king + woman - man.
approx_queen = king_embedding + woman_embedding - man_embedding

To see how close the `approx_queen` is to the `queen_embedding`, we can compute the `cosine similarity` of the 2 vectors with the following function:

In [9]:
def cosine_similarity(a, b):
    """Return the cosine of the angle between vectors ``a`` and ``b``.

    The value is the dot product of the two vectors divided by the product
    of their Euclidean norms.
    """
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    return np.dot(a, b) / norm_product

Remember that the closer the value to 1 the more similar the vectors will be:

In [10]:
cosine_similarity(queen_embedding, queen_embedding)

1.0

In [11]:
cosine_similarity(approx_queen, queen_embedding)

0.8855695132090731

In [12]:
cosine_similarity(approx_queen, man_embedding)

0.6827685781080028

In [13]:
cosine_similarity(approx_queen, woman_embedding)

0.8520431622631569

In [14]:
cosine_similarity(approx_queen, king_embedding)

0.9176379447582215

So, after performing all the comparisons, we can see that the `approx_queen` embedding is closest to `king_embedding` (0.918), even closer than it is to `queen_embedding` (0.886), and that it is the most distant from `man_embedding` (0.683).


<h3> Images with the Dall-E models</h3>

First, you will use the `dall-e-3` model to generate images from a prompt:

In [15]:
# Side length in pixels; used both for the requested image and for the frame.
image_size = 1024

response = client.images.generate(
    model="dall-e-3",
    size=f"{image_size}x{image_size}",
    prompt="""
    Everything should be painted with the Van Gogh style.
    A dinosaur playing guitar with flames in the background.
    """,
)

# The URL of the generated image is read from the first entry of the response data.
image_url = response.data[0].url
IFrame(src=image_url, width=image_size, height=image_size)

<h4>Image editing</h4>


The `edit` API lets you modify an `image` using a `mask` from the same image.

For this part we will need to load a couple of images from the `resources` directory, so make sure to update the path according to your setup!

In [20]:
# Please change this path to match your directory structure
media_path = "resources/"

# Both handles are intentionally left open: the following cells upload them to
# the API and later rewind and read them again for display.
chihuahua = open(f"{media_path}chihuahua.png", "rb")
chihuahua_mask = open(f"{media_path}chihuahua_mask.png", "rb")

Please note that only the transparent areas from the `mask` will be used for editing!

In [21]:
image_size = 256

# Rewind both handles before uploading so this cell can be re-run: after an
# upload or a read, the file position is left at the end of the file.
chihuahua.seek(0)
chihuahua_mask.seek(0)

response = client.images.edit(
    image=chihuahua,
    mask=chihuahua_mask,
    prompt="""
    The chihuahua is on a trail in Hawaii with an active volcano in the background.
    """,
    n=1,
    response_format='url',
    size=f"{image_size}x{image_size}"
)

image_url = response.model_dump()['data'][0]['url']
# urlretrieve downloads the result to a local temporary file and returns
# (filename, headers); the filename is kept so a later cell can read the image.
chihuahua_edit = ur.urlretrieve(image_url)[0]
IFrame(src=image_url, width=image_size, height=image_size)

You can see below the original `image` and the `mask` used in the `edit` call from above.

In [22]:
# Rewind the shared handles; earlier cells leave them positioned at end-of-file.
chihuahua.seek(0)
chihuahua_mask.seek(0)

img1 = chihuahua.read()
img2 = chihuahua_mask.read()
# Read the downloaded edit through a context manager so the handle is closed.
with open(chihuahua_edit, 'rb') as edited_file:
    img3 = edited_file.read()

# The source files are PNGs, so declare 'png' rather than 'jpg'.
# NOTE(review): the downloaded edit is presumably a PNG as well — confirm.
wi1 = widgets.Image(value=img1, format='png', width=image_size, height=image_size)
wi2 = widgets.Image(value=img2, format='png', width=image_size, height=image_size)
wi3 = widgets.Image(value=img3, format='png', width=image_size, height=image_size)

# Original, mask and edited result, side by side.
box = [wi1, wi2, wi3]
chihuahuas = widgets.HBox(box)
display(chihuahuas)

HBox(children=(Image(value=b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x01\x00\x00\x00\x01\x00\x08\x02\x00\x…

<h4>Image variation</h4>

This will let you create a random variation of the original `image` provided to the API call.

In [23]:
image_size = 256

# The shared handle was read to the end by the display cell above; rewind it
# so the full image is uploaded and the cell can be re-run.
chihuahua.seek(0)

response = client.images.create_variation(
    image=chihuahua,
    n=1,
    response_format='url',
    size=f"{image_size}x{image_size}"
)

image_url = response.model_dump()['data'][0]['url']
IFrame(src=image_url, width=image_size, height=image_size)

<h2>Audio with Whisper</h2>

`whisper-1` will let you generate `transcriptions` from the audio files provided. By specifying the original language of the audio, you can help the model produce a faster and more accurate result!

Make sure to check the limits for duration and file size in the documentation.

In [24]:
def show_audio_with_controls(file_path):
    """Display an HTML5 audio player for ``file_path`` in the notebook.

    Args:
        file_path: Path or URL of the audio file, used as the ``src`` of the
            ``<source>`` element.
    """
    import html  # local import: only needed to escape the attribute value

    # Quote and escape the path so spaces or special characters cannot break
    # the tag or inject markup.
    safe_src = html.escape(str(file_path), quote=True)
    display(HTML("<audio controls><source src='{}' type='audio/mpeg'></audio>".format(safe_src)))

english_audio_file_path = media_path + "english_audio.mp3"

show_audio_with_controls(english_audio_file_path)

In [26]:
# Open the audio inside a context manager so the file handle is closed as soon
# as the upload finishes.
with open(english_audio_file_path, "rb") as audio_file:
    transcript = client.audio.transcriptions.create(
        model="whisper-1",
        file=audio_file,
        language="en"
    )

transcript.text

"So, you're a travel agency that provides programs in Hajj and Umrah, and you're searching for the best hotels and services with extraordinary discounts. Well, look no further. Eid Hijri is giving travel agencies all over the world the opportunity to join its international alliance of agencies in Hajj and Umrah sectors."

Note that this API also lets you play with the temperature, so you can see how much the transcription changes as you play with this parameter.

In [27]:
# Same transcription request with a high temperature; the context manager
# closes the file handle once the upload finishes.
with open(english_audio_file_path, "rb") as audio_file:
    transcript = client.audio.transcriptions.create(
        model="whisper-1",
        file=audio_file,
        language="en",
        temperature=1.3,
    )

transcript.text

"So, you're a travel pharmacy that serves as a дом 음식"

<h4>Translations</h4>

This API will help you generate `translations` from any language in the list of languages supported by OpenAI into English text.

You can test it with the following audio in Spanish:

In [28]:
# Path of the Spanish sample clip, relative to the media_path directory.
spanish_audio_file_path = media_path + "opiniones.mp3"

# Render an audio player so the clip can be listened to before translating it.
show_audio_with_controls(spanish_audio_file_path)

In [29]:
# Open the audio inside a context manager so the file handle is closed as soon
# as the upload finishes.
with open(spanish_audio_file_path, "rb") as audio_file:
    translation = client.audio.translations.create(
        model="whisper-1",
        file=audio_file
    )

translation.text

"I love it, I like it very much, I like it, I don't like it, I don't like anything, I hate it I love them, I like them, I don't like them, I don't like anything, I hate"

<h3>Text To Speech with TTS 1</h3>

Pretty straightforward, type in whatever you want to say, choose the type of voice from the selection, point to the file where you want to store the audio and enjoy!

In [30]:
# Destination of the generated audio, relative to the media_path directory.
output_speech_file = media_path + "generated_speech.mp3"

# Ask the tts-1 model to read the input text with the "alloy" voice.
response = client.audio.speech.create(
  model="tts-1",
  voice="alloy",
  input="Look mom, I am coding in Python!"
)

# Write the returned audio to the destination file.
# NOTE(review): newer openai-python releases reportedly deprecate
# stream_to_file in favour of with_streaming_response — confirm against the
# installed version.
response.stream_to_file(output_speech_file)

In [31]:
show_audio_with_controls(output_speech_file)