In [None]:
## Create and activate a virtual environment (run in a terminal before launching Jupyter)
# $ python3 -m virtualenv .venv
# $ source .venv/bin/activate

## Install OpenAI package
# $ !pip install openai

## Set the Gilas.io API key (Python, not shell); alternatively define GILAS_API_KEY in a .env file
# os.environ["GILAS_API_KEY"] = '...'

In [None]:
import os
import tiktoken
from dotenv import load_dotenv
from openai import OpenAI

# Load variables from a local .env file (if present) before reading the key.
load_dotenv()

# OpenAI-compatible client pointed at the Gilas.io endpoint. The key is read
# from the environment so that it never has to be written into the notebook.
client = OpenAI(
    base_url="https://api.gilas.io/v1/",
    api_key=os.getenv("GILAS_API_KEY"),
)

In [None]:
def get_embedding(text, dimentions=1531, model="text-embedding-3-small"):
    """Return the embedding vector of ``text`` from the embeddings endpoint.

    Parameters
    ----------
    text : str
        Text to embed. Newlines are replaced by single spaces before sending.
    dimentions : int
        Requested vector length, forwarded as the ``dimensions`` argument.
        NOTE(review): the default 1531 looks like a typo for 1536 - confirm
        before changing, since callers may rely on the current default.
    model : str
        Name of the embedding model to use.

    Returns
    -------
    The ``embedding`` field of the first item in the response's ``data``.
    """
    single_line = " ".join(text.split("\n"))
    response = client.embeddings.create(
        model=model,
        input=[single_line],
        dimensions=dimentions,
    )
    return response.data[0].embedding

In [None]:
import pandas as pd

# Read the review dataset, keep only the columns used below, and drop
# incomplete rows.
input_datapath = "../data/reviews.csv"  # pre-filtered dataset, provided to save space
df = (
    pd.read_csv(input_datapath, index_col=0, delimiter=";")
    .loc[:, ["ProductId", "Score", "Text"]]
    .dropna()
)
# Preview the first two rows.
df.head(2)

In [None]:
import numpy as np

dimentions = 100

# Request one embedding per review and collect the vectors in a list first.
# Growing the array with np.append inside the loop copies every previously
# stored row on each iteration, which is quadratic in the number of reviews.
embedding_rows = [
    get_embedding(review_text, dimentions=dimentions) for review_text in df["Text"]
]

# Build the matrix once. The explicit target shape keeps the result at
# (0, dimentions) for an empty dataset and raises a ValueError if the service
# returned vectors of an unexpected length.
embedding_array = np.array(embedding_rows, dtype=float).reshape(
    len(embedding_rows), dimentions
)

In [None]:
# Sanity check: print the matrix shape, then show the array itself.
print(np.shape(embedding_array))
display(embedding_array)

In [None]:
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

# Cluster the embeddings with KMeans; the seed is fixed so reruns use the
# same random state.
n_clusters = 10
kmeans_params = {"n_clusters": n_clusters, "random_state": 0, "n_init": "auto"}
kmeans = KMeans(**kmeans_params).fit(embedding_array)

In [None]:
# Keep the fitted estimator's `labels_` under its own name; it is later passed
# to the plotting helper to colour the points.
kmeans_labels = kmeans.labels_

In [None]:
# Project the embeddings onto two principal components for plotting.
# random_state pins the projection when scikit-learn selects a randomized SVD
# solver, in line with the fixed seed already used for KMeans. fit_transform
# fits the model and projects the data in a single call.
PCA_model = PCA(n_components=2, random_state=0)
new_values = PCA_model.fit_transform(embedding_array)

In [None]:
import matplotlib.pyplot as plt
import mplcursors  # Ensure mplcursors is installed

def clusters_2D(x_values, y_values, labels, kmeans_labels, label_column=None):
    """Scatter-plot 2-D points coloured by cluster, with hover annotations.

    Parameters
    ----------
    x_values, y_values : array-like
        Point coordinates, passed straight to ``ax.scatter``.
    labels : DataFrame, Series or sequence
        Source of the hover text; entry ``i`` must describe point ``i``.
    kmeans_labels : array-like
        Per-point values used to colour the markers.
    label_column : str, optional
        Column/key of ``labels`` to show on hover. When omitted, a
        ``category`` attribute or column is used if ``labels`` has one (the
        original behaviour); otherwise the whole entry is shown, so a frame
        without a ``category`` column no longer raises on hover.
    """
    # Create the plot
    fig, ax = plt.subplots()
    scatter = ax.scatter(x_values,
                         y_values,
                         c=kmeans_labels,
                         cmap='Set1',
                         alpha=0.5,
                         edgecolors='k',
                         s=40)  # Marker size

    # Create a mplcursors object for interactive data point inspection
    cursor = mplcursors.cursor(scatter, hover=True)

    # Set axes titles and labels
    ax.set_title('Embedding clusters visualization in 2D')
    ax.set_xlabel('X_1')  # X-axis label
    ax.set_ylabel('X_2')  # Y-axis label

    # Define how each annotation (data point label) should look
    @cursor.connect("add")
    def on_add(sel):
        # NOTE(review): newer mplcursors releases may expose the point index
        # as `sel.index` - confirm against the installed version.
        sel.annotation.set_text(_hover_text(labels, sel.target.index, label_column))
        sel.annotation.get_bbox_patch().set(facecolor='white', alpha=0.95)
        sel.annotation.set_fontsize(14)

    plt.show()


def _hover_text(labels, position, label_column=None):
    """Return the hover text for the point at positional index ``position``."""
    if label_column is not None:
        source = labels[label_column]
    else:
        # Backward compatible: prefer a `category` attribute/column when present.
        source = getattr(labels, "category", labels)

    # Scatter points are numbered by position, whereas a pandas object may
    # carry arbitrary index labels (e.g. after dropna()), so use positional
    # lookup whenever it is available.
    entry = source.iloc[position] if hasattr(source, "iloc") else source[position]

    # A whole record (DataFrame row or mapping) is rendered as one "key: value" line per field.
    if hasattr(entry, "items"):
        return "\n".join(f"{key}: {value}" for key, value in entry.items())
    return str(entry)


In [None]:
# Plot the two PCA components coloured by KMeans cluster; `df` is handed over
# as the source of the hover labels.
pca_x, pca_y = new_values[:, 0], new_values[:, 1]
clusters_2D(x_values=pca_x, y_values=pca_y, labels=df, kmeans_labels=kmeans_labels)