In [None]:
## Activate virtual env
# $ !python3 -m virtualenv .venv
# $ !source .venv/bin/activate

## Install the packages this notebook imports
# $ !pip install openai python-dotenv numpy scikit-learn matplotlib mplcursors

## Set the Gilas.io API key (or put GILAS_API_KEY=... in a .env file, which load_dotenv() reads)
# $ import os; os.environ["GILAS_API_KEY"]='...'

In [None]:
import os
from dotenv import load_dotenv
from openai import OpenAI

# Pull variables from a local .env file (if one exists) into os.environ.
load_dotenv()

# Fail fast when the key is missing. Passing api_key=None would make the
# OpenAI client fall back to the OPENAI_API_KEY environment variable and send
# that unrelated credential to the Gilas endpoint.
if not os.environ.get("GILAS_API_KEY"):
    raise RuntimeError(
        "GILAS_API_KEY is not set; export it or add it to a .env file."
    )

# OpenAI-compatible client pointed at the Gilas.io API.
client = OpenAI(
    api_key=os.environ["GILAS_API_KEY"],
    base_url="https://api.gilas.io/v1/",
)

In [None]:
def get_embedding(text, dimentions=1531, model="text-embedding-3-small"):
    """Return the embedding vector for a single piece of text.

    Newlines in ``text`` are replaced by spaces before the request is sent
    through the module-level ``client``.

    Args:
        text: The string to embed.
        dimentions: Forwarded as the ``dimensions`` argument of the
            embeddings request (spelling kept for existing callers).
        model: Name of the embedding model to use.

    Returns:
        The ``embedding`` attribute of the first item in the response data.

    NOTE(review): the 1531 default looks unusual; text-embedding-3-small is
    usually described as 1536-dimensional. Confirm this value is intended.
    """
    single_line = " ".join(text.split("\n"))
    response = client.embeddings.create(
        input=[single_line],
        dimensions=dimentions,
        model=model,
    )
    first_item = response.data[0]
    return first_item.embedding

In [None]:
# Seven short headlines: three about animals, two about food and two about
# programming languages.
input_text_lst_news = [
    "Missing flamingo discovered at swimming pool",
    "Sea otter spotted on surfboard by beach",
    "Baby panda enjoys boat ride",
    "Breakfast themed food truck beloved by all!",
    "New curry restaurant aims to please!",
    "Python developers are wonderful people",
    "TypeScript, C++ or Java? All are great!",
]

# Keep the individual in_N names defined as well.
in_1, in_2, in_3, in_4, in_5, in_6, in_7 = input_text_lst_news

In [None]:
import numpy as np

# Embed every headline with get_embedding's default settings, then stack the
# resulting vectors into an array with one row per headline.
embeddings = [get_embedding(headline) for headline in input_text_lst_news]
embeddings_array = np.array(embeddings)

In [None]:
# Report the shape of the embeddings array, then print its contents.
print(f"Shape: {embeddings_array.shape}")
print(embeddings_array)

In [None]:
#### Reduce embeddings from 1531 (the `get_embedding` default) to 2 dimensions for visualization
# - We'll use principal component analysis (PCA).
# - You can learn more about PCA in [this video](https://www.coursera.org/learn/unsupervised-learning-recommenders-reinforcement-learning/lecture/73zWO/reducing-the-number-of-features-optional) from the Machine Learning Specialization. 

from sklearn.decomposition import PCA

# Fit a 2-component PCA on the embeddings, then project those same rows onto
# the two components for plotting.
PCA_model = PCA(n_components=2).fit(embeddings_array)
new_values = PCA_model.transform(embeddings_array)

In [None]:
# Report the shape of the PCA-projected array, then print its contents.
print(f"Shape: {new_values.shape}")
print(new_values)

In [None]:
import matplotlib.pyplot as plt
import mplcursors

def plot_2D(x_values, y_values, labels):
    """Draw a 2D scatter plot whose points show a text label on hover.

    Args:
        x_values: Horizontal coordinates, one per point.
        y_values: Vertical coordinates, one per point.
        labels: Sequence indexed by point position; ``labels[i]`` is the
            annotation text for the i-th point.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    # Create scatter plot
    fig, ax = plt.subplots()
    scatter = ax.scatter(x_values,
                         y_values,
                         alpha=0.5,
                         edgecolors='k',
                         s=40)

    # Create a mplcursors object to manage the data point interaction
    cursor = mplcursors.cursor(scatter, hover=True)

    # Title and axis labels
    ax.set_title('Embedding visualization in 2D')
    ax.set_xlabel('X_1')
    ax.set_ylabel('X_2')

    # Define how each annotation should look
    @cursor.connect("add")
    def on_add(sel):
        # Newer mplcursors releases expose the picked point's position as
        # `sel.index`; `sel.target.index` is the legacy accessor. On old
        # releases `sel.index` resolves to the tuple method instead, so fall
        # back to the legacy attribute when the value is callable.
        point_index = sel.index
        if callable(point_index):
            point_index = sel.target.index

        sel.annotation.set_text(labels[point_index])
        # Semi-transparent white background behind the annotation text
        sel.annotation.get_bbox_patch().set(facecolor='white', alpha=0.5)
        sel.annotation.set_fontsize(12)

    plt.show()


In [None]:
# Scatter the two PCA components, using each headline as the hover label.
plot_2D(
    x_values=new_values[:, 0],
    y_values=new_values[:, 1],
    labels=input_text_lst_news,
)

In [None]:
# Reduce the dimensions during embedding generation: request 2-dimensional
# embeddings for every headline and stack them into an array.
embeddings = [
    get_embedding(headline, dimentions=2) for headline in input_text_lst_news
]
embeddings_array = np.array(embeddings)

In [None]:
# Show the current embeddings_array (rebuilt in the cell above with dimentions=2).
display(embeddings_array)

In [None]:
# Plot the two embedding dimensions directly, without PCA.
# We expect the accuracy to drop with embeddings generated this way.
plot_2D(
    x_values=embeddings_array[:, 0],
    y_values=embeddings_array[:, 1],
    labels=input_text_lst_news,
)