In [0]:
%pip install sentence-transformers
%pip install databricks-vectorsearch mlflow openai
# Restart the Python process so the packages installed above can be imported by the cells below.
# NOTE(review): sentence-transformers and mlflow are not referenced in the visible cells — confirm they are still needed.
dbutils.library.restartPython()

In [0]:
from openai import OpenAI
import os
from pyspark.sql.functions import col, udf
from pyspark.sql.types import ArrayType, FloatType

# Token used as the API key for the OpenAI-style client calls in this notebook.
# How to get your Databricks token: https://docs.databricks.com/en/dev-tools/auth/pat.html
# Option 1: read a personal access token from the environment.
# DATABRICKS_TOKEN = os.environ.get('DATABRICKS_TOKEN')
# Option 2 (inside a Databricks notebook): take the API token from the notebook context.
DATABRICKS_TOKEN = dbutils.notebook.entry_point.getDbutils().notebook().getContext().apiToken().get()

def get_embedding(
    text,
    model="databricks-gte-large-en",
    base_url="https://dbc-504c50c9-8143.cloud.databricks.com/serving-endpoints",
):
    """Return the embedding vector for ``text`` from a serving endpoint.

    Args:
        text: Value passed unchanged as ``input`` to the embeddings endpoint.
        model: Name of the embedding endpoint to query. The default is the
            endpoint this notebook has always used.
        base_url: Root URL of the serving endpoints the OpenAI client talks to.

    Returns:
        The ``embedding`` attribute of the first item in the response data.
    """
    # DATABRICKS_TOKEN is looked up at call time from the notebook namespace.
    client = OpenAI(api_key=DATABRICKS_TOKEN, base_url=base_url)
    response = client.embeddings.create(input=text, model=model)
    return response.data[0].embedding


In [0]:
%sql
-- Switch the session to the notebook's catalog and schema, then list the
-- id and text columns of test_text_embeddings.
USE CATALOG `big_data_ii_2025`;
USE SCHEMA `spark_examples`;
SELECT id, text from test_text_embeddings;

In [0]:
from databricks.vector_search.client import VectorSearchClient


In [0]:
# Client handle used for every Vector Search call in this notebook.
vsc = VectorSearchClient()

# Look up the existing index by its fully qualified name and request a sync.
# (A `vsc.start_indexing(index_name=...)` call for the same index was sketched
# here earlier and is intentionally left disabled.)
index = vsc.get_index(index_name="big_data_ii_2025.spark_examples.test_vector_search_index")
index.sync()


In [0]:

# Display the result of describe() for the index fetched above (the cell's last expression is its output).
index.describe()

In [0]:
# Question to answer; it is embedded and used as the similarity-search query.
consulta = "¿Cuál es la capital de Inglaterra?"

# NOTE(review): the question is in Spanish while get_embedding's model name
# ends in "-en" — confirm the embedding model handles Spanish text well.
query_vector = get_embedding(consulta)

# Reuse the `index` handle created together with `vsc`; fetching the same
# index a second time only added a redundant round-trip.
results = index.similarity_search(
    query_vector=query_vector,
    num_results=2,
    columns=["id", "text"]
)

print(results)

In [0]:
# Find the position of the "text" column in the result manifest.
columns = results['manifest']['columns']
text_idx = next(
    (position for position, column in enumerate(columns) if column['name'] == 'text'),
    None,
)
if text_idx is None:
    # Fail with a clear message instead of a bare StopIteration.
    raise ValueError("similarity_search results do not contain a 'text' column")

# A search with no hits may omit 'data_array'; treat that as zero rows.
rows = results.get('result', {}).get('data_array') or []

# Concatenate all non-null "text" values into a single string.
context = " ".join(str(row[text_idx]) for row in rows if row[text_idx] is not None)

print(context)

In [0]:
from openai import OpenAI
import os

# How to get your Databricks token: https://docs.databricks.com/en/dev-tools/auth/pat.html
# DATABRICKS_TOKEN = os.environ.get('DATABRICKS_TOKEN')
# Alternatively in a Databricks notebook you can use this:
# DATABRICKS_TOKEN = dbutils.notebook.entry_point.getDbutils().notebook().getContext().apiToken().get()

# OpenAI-style client pointed at the workspace's serving endpoints.
client = OpenAI(
    base_url="https://dbc-504c50c9-8143.cloud.databricks.com/serving-endpoints",
    api_key=DATABRICKS_TOKEN,
)

# A single user turn carries both the question and the retrieved context.
prompt = f"Contesta la pregunta: '{consulta}' usando la información de este contexto: '{context}'"
response = client.chat.completions.create(
    model="databricks-meta-llama-3-1-8b-instruct",
    messages=[{"role": "user", "content": prompt}],
)

# Echo the inputs, then the model's answer framed by separator lines.
separator = "=========================================="
for line in (
    f"La pregunta es '{consulta}'",
    f"El contexto es '{context}'",
    separator,
    response.choices[0].message.content,
    separator,
):
    print(line)
