In [None]:
# The text before the first "-" in the secret scope is compared against "prod";
# when it matches, the notebook exits immediately and no later cell runs.
secret_scope = dbutils.widgets.get("secret_scope")

if secret_scope.partition("-")[0] == "prod":
    dbutils.notebook.exit("Skip run in prod environment")

In [None]:
# Catalog and database names come from notebook widgets; they are combined later
# into the fully qualified name of the ml_qna_embed table.
ml_catalog, ml_search_db = (
    dbutils.widgets.get(widget_name) for widget_name in ("ml_catalog", "ml_search_db")
)

### Load Question and Answer Data

In [None]:
import json
import boto3

# S3 location of the raw question-and-answer JSON document.
bucket_name = "ml-models-bucket-appbuild-02"
file_path = "evaluation_framework/raw-qna-data.json"

# The boto3 session is backed by a Databricks service credential, so no AWS keys
# appear in the notebook.
boto3_session = boto3.Session(
    botocore_session=dbutils.credentials.getServiceCredentialsProvider(
        'service-cred-nas-lifion_ml-sdq-dit'
    )
)
s3_client = boto3_session.client('s3')

# Fetch the object and parse its body stream as JSON.
response = s3_client.get_object(Key=file_path, Bucket=bucket_name)
data = json.load(response['Body'])

In [None]:
# Number of top-level entries in the JSON document loaded from S3.
len(data)

In [None]:
# Build a Spark DataFrame from the parsed JSON and keep only the listed columns.
cols = ['id', 'description_t', 'metadata_s', 'type_t', 'clientId_ss']
df = spark.createDataFrame(data).select(*cols)

In [None]:
# Row count per record type.
df.groupBy('type_t').count().show()

In [None]:
from pyspark.sql.functions import concat_ws

# Keep only the "qna" records and add a "combined" column: the id and metadata_s
# values joined with " | ". This is the text that is embedded later.
df_qna = (
    df
    .filter(df["type_t"] == "qna")
    .withColumn("combined", concat_ws(" | ", df["id"], df["metadata_s"]))
)

In [None]:
# Render the filtered QnA rows, including the new "combined" column.
display(df_qna)

In [None]:
# Convert the QnA rows to a pandas DataFrame for the row-by-row embedding step.
# For a quick trial run, sample first: df_qna.sample(fraction=0.01).toPandas()
pdf = df_qna.toPandas()

In [None]:
import sys
from pathlib import Path

# Put the directory three levels above the working directory on sys.path so the
# `src.` imports in later cells resolve.
current_dir = Path.cwd()
parent_dir = current_dir.parent.parent.parent

sys.path.append(str(parent_dir))
print(parent_dir)

In [None]:
from src.search.utils.data_profiling_llm import get_bearer_token

# The client secret is supplied through a notebook widget rather than stored in source.
client_secret = dbutils.widgets.get("client_secret")

# Request a token up front so a bad secret fails here, before the embedding loop.
bearer_token = get_bearer_token(client_secret)

# Never print the token itself: notebook output is persisted and shared, so echoing
# it would leak a live credential. Report only whether one was obtained.
print(f"Bearer token acquired: {bool(bearer_token)}")

In [None]:
import time
from src.search.utils.data_profiling_llm import get_openai_embedding

# Time the whole embedding pass; get_openai_embedding is called once per row of `pdf`
# on that row's "combined" text.
embed_start_time = time.time()

pdf["embedding"] = pdf["combined"].apply(
    lambda text: get_openai_embedding(client_secret, text)
)

embed_time = time.time() - embed_start_time

In [None]:
# Report the wall-clock duration measured around the embedding step.
print(f"Embedding took {embed_time} seconds")

In [None]:
from pyspark.sql import functions as F

# Column expression for the current date minus one day; the write cell below
# derives the partition columns from it.
yesterday = F.date_sub(F.current_date(), 1)

In [None]:
# Enable Delta automatic schema merging for this session.
spark.conf.set("spark.databricks.delta.schema.autoMerge.enabled", "true")

# Fully qualified target table, built from the catalog/database widgets.
qna_embed_table = f"{ml_catalog}.{ml_search_db}.ml_qna_embed"

# Start from a clean table on every run.
spark.sql(f"DROP TABLE IF EXISTS {qna_embed_table}")

# Partition columns are the calendar year, month and day-of-month of `yesterday`.
# All three were previously computed with F.year, so "month" and "day" held the
# year value instead of the month and day.
spark_df = (
    spark.createDataFrame(pdf)
    .withColumn("year", F.year(yesterday))
    .withColumn("month", F.month(yesterday))
    .withColumn("day", F.dayofmonth(yesterday))
)

(
    spark_df.write
    .format("delta")
    .mode("overwrite")
    .option("mergeSchema", "true")
    .partitionBy("year", "month", "day")
    .saveAsTable(qna_embed_table)
)

### Index Embeddings in Redis and Run a Sample Vector Search

In [None]:
# Read back the embeddings table from the same widget-driven catalog and database
# the write step targets, rather than a hard-coded DIT location, so the notebook
# always indexes what it just wrote.
df_qna = spark.sql(f"select * from {ml_catalog}.{ml_search_db}.ml_qna_embed")
display(df_qna)

In [None]:
# Convert the stored embeddings to a pandas DataFrame for the reshaping below.
pdf = df_qna.toPandas()

In [None]:
# The index stores the record id under the name "qna_id".
pdf.rename(columns={'id': 'qna_id'}, inplace=True)

# One output row per (record, client id) pair, restricted to the indexed fields;
# rows with any missing value are discarded.
pdf_exploded = (
    pdf
    .explode('clientId_ss')
    .loc[:, ['qna_id', 'clientId_ss', 'description_t', 'metadata_s', 'embedding']]
    .dropna()
)

In [None]:
import numpy as np


def _to_float32_bytes(values):
    """Return the embedding vector as raw float32 bytes."""
    return np.array(values).astype(np.float32).tobytes()


# Replace each embedding with its float32 byte string, matching the "float32"
# datatype declared for the vector field in the index schema.
pdf_exploded['embedding'] = pdf_exploded['embedding'].apply(_to_float32_bytes)

In [None]:
# Show the version reported by the `rvl` command-line tool in this environment.
!rvl version

In [None]:
import redis

# Redis connection parameters; the password is supplied through a notebook widget.
REDIS_HOST = 'search01d.us.caas.oneadp.com'
REDIS_PORT = 443
REDIS_PASSWORD = dbutils.widgets.get("redis_secret")

# TLS is enabled, but ssl_cert_reqs="none" means the server certificate is not
# verified. NOTE(review): confirm this is intentional for this endpoint.
redis_client = redis.Redis(
    host=REDIS_HOST,
    port=REDIS_PORT,
    password=REDIS_PASSWORD,
    ssl=True,
    ssl_cert_reqs="none",
    decode_responses=True,
)

# Round-trip to the server so a bad host or password fails here rather than later.
redis_client.ping()

In [None]:
from redisvl.schema import IndexSchema
from redisvl.index import SearchIndex

index_name = "question_answer_json_v0"

# Vector field definition. "dims" has to equal the length of the stored embedding
# vectors (presumably 3072 for the embedding model in use; TODO confirm).
embedding_field = {
    "type": "vector",
    "name": "embedding",
    "attrs": {
        "dims": 3072,
        "distance_metric": "cosine",
        "algorithm": "flat",
        "datatype": "float32",
    },
}

schema = IndexSchema.from_dict({
    # The index name doubles as the key prefix of its hash records.
    "index": {
        "name": index_name,
        "prefix": index_name,
        "storage_type": "hash",
    },
    "fields": [
        {"type": "tag", "name": "qna_id"},
        {"type": "tag", "name": "clientId_ss"},
        {"type": "tag", "name": "description_t"},
        {"type": "text", "name": "metadata_s"},
        embedding_field,
    ],
})

# Recreate the index on every run: overwrite replaces an existing definition and
# drop also removes the records previously stored under it.
index = SearchIndex(schema, redis_client)
index.create(overwrite=True, drop=True)

In [None]:
# Load every exploded row into the index, passing one dict per DataFrame row.
index.load(pdf_exploded.to_dict(orient="records"))

In [None]:
# Document count reported by the index after loading.
index.info()['num_docs']

In [None]:
from redisvl.query import VectorQuery
from redisvl.query.filter import Tag

# Sample search phrase, embedded with the same helper used for the stored rows.
user_query = "401k"
vector = get_openai_embedding(client_secret, user_query)

In [None]:
# Restrict the search to records tagged with one specific client id.
t = Tag("clientId_ss") == "3f346c64-d57b-4b0e-adaa-75be4a3165ca"

# The query vector is serialised exactly like the stored embeddings: float32 bytes.
query_vector_bytes = np.array(vector).astype(np.float32).tobytes()

# NOTE(review): 'type_t' is requested below but is not among the columns loaded
# into the index, so it may be missing from the results. Confirm or drop it.
v = VectorQuery(
    vector=query_vector_bytes,
    vector_field_name="embedding",
    return_fields=['qna_id', 'description_t', 'metadata_s', 'type_t', 'clientId_ss'],
    filter_expression=t,
)

results = index.query(v)
display(results)