In [None]:
# Secret scope name supplied through the "secret_scope" notebook widget.
secret_scope = dbutils.widgets.get("secret_scope")

# The text before the first "-" is compared against "prod"; on a match the
# notebook exits immediately.
scope_prefix = secret_scope.partition("-")[0]
if scope_prefix == "prod":
    dbutils.notebook.exit("Skip run in prod environment")

In [None]:
# Catalog and database names for the embedding table, both read from notebook
# widgets of the same name.
ml_catalog, ml_search_db = (
    dbutils.widgets.get(widget_name)
    for widget_name in ("ml_catalog", "ml_search_db")
)

### Load Question and Answer Data

In [None]:
import json
import boto3
from langchain_community.document_loaders import PyPDFLoader

# S3 location of the source PDF and the local file name it is downloaded to.
bucket_name = "ml-models-bucket-appbuild-02"
file_path = "evaluation_framework/data.pdf"
local_path = "file.pdf"

# boto3 session backed by the Databricks service credential named below.
credentials_provider = dbutils.credentials.getServiceCredentialsProvider(
    'service-cred-nas-lifion_ml-sdq-dit'
)
boto3_session = boto3.Session(botocore_session=credentials_provider)

# Download the PDF from S3 into the current working directory.
s3_client = boto3_session.client('s3')
s3_client.download_file(bucket_name, file_path, local_path)

# Parse the downloaded PDF into LangChain documents.
loader = PyPDFLoader(local_path)
documents = loader.load()

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Split the documents that were already loaded into `documents` into chunks,
# using a target chunk size of 2500 with no overlap between chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=2500, chunk_overlap=0
)

# Reuse `documents` instead of calling `loader.load_and_split`, which would
# read and parse the PDF a second time (and is marked deprecated in LangChain).
chunks = text_splitter.split_documents(documents)

print("Done preprocessing. Created", len(chunks), "chunks of the original pdf", len(documents))

In [None]:
import pandas as pd

# One row per chunk; the single "reference" column holds the chunk's text.
reference_texts = [chunk.page_content for chunk in chunks]
df = pd.DataFrame(reference_texts, columns=["reference"])

In [None]:
# Render the chunk DataFrame for visual inspection.
display(df)

In [None]:
from pathlib import Path
import sys

# Make the directory three levels above the working directory importable so
# that the `src.*` imports in later cells can be resolved.
current_dir = Path.cwd()
parent_dir = current_dir.parent.parent.parent

# Only add the entry once, so re-running this cell does not keep appending
# duplicates to sys.path.
if str(parent_dir) not in sys.path:
    sys.path.append(str(parent_dir))
print(parent_dir)

In [None]:
from src.search.utils.data_profiling_llm import get_bearer_token

# Client secret supplied through the "client_secret" notebook widget.
client_secret = dbutils.widgets.get("client_secret")

# Obtain a bearer token from the client secret.
bearer_token = get_bearer_token(client_secret)

# The token is a credential: never print it, because notebook output is stored
# with the notebook and is visible to anyone who can open it. Report only
# whether a token was returned.
print("Bearer token acquired" if bearer_token else "No bearer token returned")

In [None]:
import time
from src.search.utils.data_profiling_llm import get_openai_embedding

def _embed_reference(text):
    """Return what get_openai_embedding yields for one reference text."""
    return get_openai_embedding(client_secret, str(text))


# Embed every row's reference text, one call per row, and record the
# wall-clock duration of the whole pass in `embed_time` (seconds).
embed_start_time = time.time()
df["embedding"] = df["reference"].apply(_embed_reference)
embed_time = time.time() - embed_start_time

In [None]:
# Report how long the embedding pass took (wall-clock seconds).
print(f"Embedding took {embed_time} seconds")

In [None]:
from pyspark.sql import functions as F

# Spark column expression for the current date minus one day; it is evaluated
# by Spark when a query using it runs, not here.
yesterday = F.date_sub(F.current_date(), 1)

In [None]:
# Enable Delta schema auto-merge, then drop any previous copy of the table so
# it is rebuilt from the freshly computed embeddings.
spark.conf.set("spark.databricks.delta.schema.autoMerge.enabled","true")
embed_table = f"{ml_catalog}.{ml_search_db}.ml_qna_kc_embed"
spark.sql(f"DROP TABLE IF EXISTS {embed_table}")

# Add year, month, and day partition columns derived from yesterday's date.
# Each column uses its own extractor: the previous version applied F.year to
# all three, so "month" and "day" both contained the year.
spark_df = (
    spark.createDataFrame(df)
    .withColumn("year", F.year(yesterday))
    .withColumn("month", F.month(yesterday))
    .withColumn("day", F.dayofmonth(yesterday))
)

# Write the embeddings as a Delta table partitioned by year/month/day.
(
    spark_df.write
    .format("delta")
    .mode("overwrite")
    .option("mergeSchema", "true")
    .partitionBy("year", "month", "day")
    .saveAsTable(embed_table)
)

### LLM Embedding

In [None]:
# Read the embedding table from the catalog/database selected by the
# `ml_catalog` and `ml_search_db` widgets, i.e. the same table this notebook
# writes. A hard-coded DIT table name here would return stale or unrelated
# data whenever the notebook runs against any other catalog.
df_qna = spark.sql(f"select * from {ml_catalog}.{ml_search_db}.ml_qna_kc_embed")
display(df_qna)

In [None]:
# Convert the Spark DataFrame into a pandas DataFrame for loading into Redis.
pdf = df_qna.toPandas()

In [None]:
import numpy as np

def _as_float32_bytes(values):
    """Serialize one embedding as the raw bytes of a float32 array."""
    return np.array(values).astype(np.float32).tobytes()


# Replace each embedding with its float32 byte string, matching the "float32"
# datatype declared for the vector field in the Redis index schema.
# NOTE(review): this overwrites the column in place, so the cell is not meant
# to be re-run without first re-creating `pdf`.
pdf['embedding'] = pdf['embedding'].apply(_as_float32_bytes)

In [None]:
# Shell out to the `rvl` command-line tool and show its version
# (presumably the RedisVL CLI installed with the redisvl package - confirm).
!rvl version

In [None]:
import redis

# Redis connection parameters; the password comes from the "redis_secret" widget.
REDIS_HOST = 'search01d.us.caas.oneadp.com'
REDIS_PORT = 443
REDIS_PASSWORD = dbutils.widgets.get("redis_secret")

# Options for the TLS connection.
# NOTE(review): ssl_cert_reqs="none" turns off server certificate
# verification - confirm this is intentional for this endpoint.
redis_connection_options = {
    "host": REDIS_HOST,
    "port": REDIS_PORT,
    "password": REDIS_PASSWORD,
    "decode_responses": True,
    "ssl_cert_reqs": "none",
    "ssl": True,
}

# Create the client and verify the connection with a PING.
redis_client = redis.Redis(**redis_connection_options)
redis_client.ping()

In [None]:
from redisvl.schema import IndexSchema
from redisvl.index import SearchIndex

# The index name doubles as the key prefix for the stored hashes.
index_name = "question_answer_pdf_v0"

# Index-level settings: documents are stored as Redis hashes.
index_settings = {
    "name": index_name,
    "prefix": index_name,
    "storage_type": "hash",
}

# Full-text field holding the chunk text.
reference_field = {"type": "text", "name": "reference"}

# Vector field holding the embedding: 3072-dimensional float32 vectors,
# searched with a flat index using cosine distance.
embedding_field = {
    "type": "vector",
    "name": "embedding",
    "attrs": {
        "dims": 3072,
        "distance_metric": "cosine",
        "algorithm": "flat",
        "datatype": "float32",
    },
}

schema = IndexSchema.from_dict({
    "index": index_settings,
    "fields": [reference_field, embedding_field],
})

# (Re)create the index; overwrite=True and drop=True are passed so that an
# existing index of the same name is replaced along with its data.
index = SearchIndex(schema, redis_client)
index.create(overwrite=True, drop=True)

In [None]:
# Load every row of `pdf` into the index, passing the rows as a list of
# per-row dicts (column name -> value).
index.load(pdf.to_dict(orient="records"))

In [None]:
# Show the document count reported by the index as a check on the load.
index.info()['num_docs']

In [None]:
from redisvl.query import VectorQuery

# Sample query text (a question and an answer joined by " | ") used to
# exercise the index.
user_query = "How do I report a workplace safety concern? | If you observe or become aware of any unsafe condition, you must promptly notify your supervisor and the Human Resources Department so that the Company can take action to correct it."
# Embed the query with the same helper used for the stored reference texts.
vector = get_openai_embedding(client_secret, user_query)

In [None]:
# Serialize the query embedding the same way as the stored embeddings
# (raw float32 bytes).
query_vector_bytes = np.array(vector).astype(np.float32).tobytes()

# Ask for the 3 nearest entries on the "embedding" field, returning the
# reference text together with the score.
v = VectorQuery(
    vector=query_vector_bytes,
    vector_field_name="embedding",
    num_results=3,
    return_fields=['reference'],
    return_score=True,
)

results = index.query(v)
display(results)