In [0]:
%pip install sentence-transformers
%pip install databricks-vectorsearch mlflow openai
# Restart the Python process after the installs above.
# NOTE(review): sentence-transformers and mlflow are not imported in any of the
# visible cells — confirm they are still needed.
dbutils.library.restartPython()

In [0]:
# Sample corpus: ten short Spanish sentences that are embedded and indexed
# by the cells below.
documentos = [
    "La capital de Francia es París.",
    "París es conocida como la Ciudad de la Luz.",
    "Londres es la capital del Reino Unido.",
    "El fútbol es el deporte más popular en el Reino Unido.",
    "Lionel Messi ganó su séptimo Balón de Oro.",
    "El aprendizaje automático es una rama de la inteligencia artificial.",
    "La inteligencia artificial está revolucionando muchas industrias.",
    "La nueva película de ciencia ficción fue un éxito de taquilla.",
    "Los gatos son mascotas populares en todo el mundo.",
    "Python es un lenguaje de programación muy utilizado en ciencia de datos."
]
from pyspark.sql.functions import monotonically_increasing_id

# Build the documents DataFrame with small, consecutive ids assigned explicitly.
# monotonically_increasing_id() packs the partition index into the upper bits of
# a 64-bit value, so ids generated on different partitions do not fit the INT
# `id` column of the target table and can collide (or raise) when cast to INT.
df_docs = spark.createDataFrame(
    [(text, doc_id) for doc_id, text in enumerate(documentos)],
    ["text", "id"],
)
display(df_docs)


In [0]:
from openai import OpenAI
import os
from pyspark.sql.functions import col, udf
from pyspark.sql.types import ArrayType, FloatType

# The serving endpoint is authenticated with a Databricks token.
# How to create a personal access token: https://docs.databricks.com/en/dev-tools/auth/pat.html
# Outside a notebook the token could be read from the environment instead:
# DATABRICKS_TOKEN = os.environ.get('DATABRICKS_TOKEN')
# Inside a Databricks notebook, take the API token from the notebook context.
_notebook_context = dbutils.notebook.entry_point.getDbutils().notebook().getContext()
DATABRICKS_TOKEN = _notebook_context.apiToken().get()

def get_embedding(
    text,
    model="databricks-gte-large-en",
    base_url="https://dbc-504c50c9-8143.cloud.databricks.com/serving-endpoints",
):
    """Return the embedding vector for ``text`` from a model serving endpoint.

    Args:
        text: The string to embed. ``None`` is passed through as ``None`` so a
            null row does not fail the whole job when this runs as a Spark UDF.
        model: Name of the embedding model to request.
        base_url: Base URL of the serving endpoints, used by the OpenAI client.

    Returns:
        The ``embedding`` of the first item in the response, or ``None`` when
        ``text`` is ``None``.
    """
    if text is None:
        return None
    # The client is created inside the function, so each call builds its own.
    client = OpenAI(api_key=DATABRICKS_TOKEN, base_url=base_url)
    response = client.embeddings.create(input=text, model=model)
    return response.data[0].embedding

# Expose get_embedding as a Spark UDF that returns an array of floats.
embedding_type = ArrayType(FloatType())
get_embeddings_udf = udf(get_embedding, embedding_type)

# Keep every existing column and append the embedding computed from `text`.
embedding_column = get_embeddings_udf(col("text")).alias("embedding")
df_docs_embeddings = df_docs.select("*", embedding_column)
display(df_docs_embeddings)

In [0]:
%sql
-- Select the working catalog and schema, then recreate from scratch the Delta
-- table that holds each document's id, text and embedding vector.
USE CATALOG big_data_ii_2025;
USE SCHEMA spark_examples;
DROP TABLE IF EXISTS test_text_embeddings;
CREATE TABLE test_text_embeddings (id INT, text STRING, embedding ARRAY<FLOAT>)
USING DELTA;

In [0]:
%sql
-- Turn on the Delta Change Data Feed for the embeddings table.
-- NOTE(review): presumably needed by the Delta Sync vector search index that is
-- created on this table in a later cell — confirm.
ALTER TABLE big_data_ii_2025.spark_examples.test_text_embeddings
SET TBLPROPERTIES (delta.enableChangeDataFeed = true);

In [0]:
from databricks.vector_search.client import VectorSearchClient

# Client used for all vector search operations in this notebook.
vsc = VectorSearchClient()

# Settings of the Delta Sync index built on the embeddings table: the vectors
# are read from the `embedding` column and rows are keyed by `id`.
index_settings = {
    "endpoint_name": "test_endpoint",
    "source_table_name": "big_data_ii_2025.spark_examples.test_text_embeddings",
    "index_name": "big_data_ii_2025.spark_examples.test_vector_search_index",
    "pipeline_type": "TRIGGERED",
    "primary_key": "id",
    "embedding_dimension": 1024,
    "embedding_vector_column": "embedding",
}
vsc.create_delta_sync_index(**index_settings)

In [0]:
# Read the schema of the target table so the DataFrame can be aligned to it.
existing_table_schema = spark.table("big_data_ii_2025.spark_examples.test_text_embeddings").schema

# Select the table's columns in the table's order, each cast to the table's type.
casted_columns = [
    df_docs_embeddings[field.name].cast(field.dataType).alias(field.name)
    for field in existing_table_schema.fields
]
df_docs_embeddings_casted = df_docs_embeddings.select(casted_columns)

# Append the aligned rows to the Delta table.
(
    df_docs_embeddings_casted.write
    .format("delta")
    .mode("append")
    .saveAsTable("big_data_ii_2025.spark_examples.test_text_embeddings")
)

In [0]:
vsc = VectorSearchClient()

# Look up the index created on the embeddings table.
index = vsc.get_index(
    index_name="big_data_ii_2025.spark_examples.test_vector_search_index"
)

# The index may still be provisioning right after creation; block until it is
# ready so the sync request below is not issued against an index that cannot
# accept it yet.
index.wait_until_ready()

# Trigger a sync so the rows appended to the source table are indexed.
# NOTE(review): the sync appears to run asynchronously — results may not be
# visible to the following cells immediately; confirm.
index.sync()


In [0]:
# Print up to 10 entries scanned from the index.
scan_result = index.scan(num_results=10)
print(scan_result)

# Last expression of the cell: the index description is shown as cell output.
index.describe()

In [0]:
consulta = "¿Cuál es la capital de Inglaterra?"

# Embed the query with the same function used for the documents.
query_vector = get_embedding(consulta)

# Fetch the index and ask for the 2 nearest entries, returning id and text.
index = vsc.get_index(index_name="big_data_ii_2025.spark_examples.test_vector_search_index")
search_args = {
    "query_vector": query_vector,
    "num_results": 2,
    "columns": ["id", "text"],
}
results = index.similarity_search(**search_args)

print(results)