In [None]:
# Notebook parameters: the catalog and schema that hold the search tables.
ml_catalog, ml_search_db = (
    dbutils.widgets.get(widget_name) for widget_name in ("ml_catalog", "ml_search_db")
)

# One row per distinct (_id, caption, subtitle) in the US context, with the
# number of matching rows exposed as `views`, largest count first.
action_views_sql = f"""
    SELECT _id, caption, subtitle, count(*) AS views
    FROM {ml_catalog}.{ml_search_db}.ml_search_action
    WHERE context='US'
    GROUP BY 1, 2, 3
    ORDER BY 4 DESC
"""
df_action = spark.sql(action_views_sql)

### Display Top Search Action Views

In [None]:
# Render the aggregated action rows (the query orders them by view count, descending).
display(df_action)

### Tokenize and Normalize

In [None]:
import string

In [None]:
def _combine_caption_subtitle(row):
    """Merge a row's caption and subtitle into one comma-separated string.

    Non-string values (None/NaN produced by SQL NULLs) are treated as empty
    strings so the row does not raise on ``.lower()`` or ``str.join``.
    When both parts are equal ignoring case, only the caption is returned.
    """
    caption = row.caption if isinstance(row.caption, str) else ""
    subtitle = row.subtitle if isinstance(row.subtitle, str) else ""
    if caption.lower() == subtitle.lower():
        return caption
    return ','.join([caption, subtitle])


pdf_action = df_action.toPandas()
pdf_action['combined'] = pdf_action.apply(_combine_caption_subtitle, axis=1)
# A blank line separates documents so the corpus can be split back per row later.
documents = "\n\n".join(pdf_action.combined)
# Preview the first 100 characters of the joined corpus.
documents[:100]

In [None]:
from pathlib import Path
import sys

# Put the directory two levels above the working directory on the import path
# (presumably the project root that contains the `search` package — confirm).
current_dir = Path.cwd()
parent_dir = current_dir.parent.parent
# Guard the append so re-running this cell does not pile up duplicate entries.
if str(parent_dir) not in sys.path:
    sys.path.append(str(parent_dir))

In [None]:
from search.utils.data_profiling_nlp import CustomTextSplitter

# Split the joined corpus on the blank-line separator used to build `documents`.
# NOTE(review): later cells treat every returned chunk as a comma-separated
# token string aligned 1:1 with the rows of pdf_action — confirm against
# CustomTextSplitter.
splitter = CustomTextSplitter(separator="\n\n")
processed_tokens = splitter.split_text(documents)

### Extract Corpus

In [None]:
from collections import Counter
import pandas as pd

# Flatten all processed chunks into a single list of comma-separated tokens.
words = ','.join(processed_tokens)
w = words.split(',')

# Take the 100 most frequent tokens, then list them alphabetically by keyword.
keyword_counts = Counter(w)
most_common_keywords = keyword_counts.most_common(100)
sorted_keywords = list(most_common_keywords)
sorted_keywords.sort(key=lambda pair: pair[0])

top_keywords = pd.DataFrame(sorted_keywords, columns=['keyword', 'count'])
display(top_keywords)

In [None]:
from typing import List
import nltk

# For every processed chunk, keep its (up to) five most frequent tokens and
# store them as one comma-separated string per source row.
keyword_chunks: List[str] = []
for tokens in processed_tokens:
    freq_dist = nltk.FreqDist(tokens.split(","))
    # Use a loop-local name: assigning to `top_keywords` here would clobber the
    # keyword-count DataFrame built in the "Extract Corpus" cell.
    chunk_keywords = [word for word, _ in freq_dist.most_common(5)]
    keyword_chunks.append(",".join(chunk_keywords))

pdf_action['keywords'] = keyword_chunks
# Reassign instead of dropping in place, and tolerate a missing column, so the
# cell can be re-run without a KeyError once 'combined' has been removed.
pdf_action = pdf_action.drop(columns=['combined'], errors='ignore')
display(pdf_action)

### Extract Keywords by BM25

In [None]:
# The BM25 corpus is the list of processed chunks; print the first two as a sanity check.
passages = processed_tokens
print(f"--{passages[0]}\n--{passages[1]}")

In [None]:
from rank_bm25 import BM25Okapi


def bm25_tokenizer(text):
    """Split comma-separated text into BM25 tokens.

    Every comma-delimited piece has leading and trailing ASCII punctuation
    removed; pieces that end up empty are discarded.
    """
    stripped_pieces = (piece.strip(string.punctuation) for piece in text.split(','))
    return [piece for piece in stripped_pieces if len(piece) > 0]

# Tokenize every passage with the tokenizer that is also applied to queries.
tokenized_corpus = [bm25_tokenizer(passage) for passage in passages]

# Create a BM25 index from the tokenized document corpus
bm25 = BM25Okapi(tokenized_corpus)

In [None]:
import numpy as np


def search(query, top_k=3, num_candidates=100):
    """Print the best BM25 (lexical) matches for ``query``.

    Args:
        query: Query text; tokenized with ``bm25_tokenizer`` (comma-separated).
        top_k: Number of hits to print.
        num_candidates: Size of the candidate pool selected before the final
            sort. Clamped to the number of scored passages; a value below 1
            yields no hits.

    Returns:
        None. Results are written to stdout.
    """
    print("Input question:", query)

    ##### BM25 search (lexical search) #####
    bm25_scores = bm25.get_scores(bm25_tokenizer(query))
    # np.argpartition raises ValueError when kth is out of bounds, so never ask
    # for more candidates than there are scored passages.
    pool_size = min(num_candidates, len(bm25_scores))
    if pool_size > 0:
        top_n = np.argpartition(bm25_scores, -pool_size)[-pool_size:]
    else:
        top_n = []
    bm25_hits = [{'corpus_id': idx, 'score': bm25_scores[idx]} for idx in top_n]
    bm25_hits = sorted(bm25_hits, key=lambda x: x['score'], reverse=True)

    # The header reports the requested top_k instead of a hard-coded "3".
    print(f"\nTop-{top_k} lexical search (BM25) hits")
    print("-----------------------------------")

    for hit in bm25_hits[0:top_k]:
        print("\t{:.3f}\t{}".format(hit['score'],passages[hit['corpus_id']].replace("\n", " ")))

In [None]:
# Smoke-test the BM25 index with a single-keyword query.
search("dashboard", top_k=3, num_candidates=100)

### LLM Embedding

In [None]:
from pyspark.sql.functions import concat_ws

# Build the text that is embedded later: caption and subtitle joined by " | ".
df_action = df_action.withColumn("combined", concat_ws(" | ", "caption", "subtitle"))
# df = df_action.select("combined").dropDuplicates()
# pdf = df.sample(fraction=0.1).toPandas()
pdf = df_action.toPandas()

In [None]:
from search.utils.data_profiling_llm import get_bearer_token

client_secret = dbutils.widgets.get("client_secret")
bearer_token = get_bearer_token(client_secret)
# Never echo the token itself: notebook outputs are stored and shared, so
# printing it would leak a live credential. Only confirm that one was obtained.
print("Bearer token acquired" if bearer_token else "Bearer token is empty")

In [None]:
import time
from search.utils.data_profiling_llm import get_openai_embedding

# Embed every combined string (one get_openai_embedding call per row) and
# report how long the whole pass took.
embed_start_time = time.time()

pdf["embedding"] = pdf["combined"].apply(
    lambda text: get_openai_embedding(client_secret, text)
)

embed_time = time.time() - embed_start_time
print(f"Embedding took {embed_time} seconds")

In [None]:
import numpy as np

# Serialize each embedding to raw float32 bytes (float32 is the datatype
# declared for the vector field in the Redis index schema below).
pdf['embedding'] = pdf['embedding'].apply(
    lambda values: np.asarray(values, dtype=np.float32).tobytes()
)

In [None]:
from pyspark.sql import functions as F

# Column expression for the day before the current date; used below to derive partition columns.
yesterday = F.date_sub(F.current_date(), 1)

In [None]:
spark.conf.set("spark.databricks.delta.schema.autoMerge.enabled","true")
spark.sql(f"DROP TABLE IF EXISTS {ml_catalog}.{ml_search_db}.ml_search_action_embed")


# Add year, month, and day partition columns derived from yesterday's date.
# Each column uses its own extractor: previously all three used F.year, so
# `month` and `day` both held the year value.
spark_df = (
    spark.createDataFrame(pdf)
    .withColumn("year", F.year(yesterday))
    .withColumn("month", F.month(yesterday))
    .withColumn("day", F.dayofmonth(yesterday))
)

# Rewrite the embedding table as a Delta table partitioned by date parts.
(spark_df
.write
.format("delta")
.mode("overwrite")
.option("mergeSchema", "true")
.partitionBy("year", "month", "day")
.saveAsTable(f"{ml_catalog}.{ml_search_db}.ml_search_action_embed"))

In [None]:
secret_scope = dbutils.widgets.get("secret_scope")

# The text before the first "-" in the scope name decides whether to stop here
# (presumably it names the environment — confirm). In prod the notebook exits
# before any of the Redis cells below run.
scope_prefix = secret_scope.partition("-")[0]
if scope_prefix == "prod":
    dbutils.notebook.exit("Skip run in prod environment")

### Redis Index

In [None]:
# Shell out to the `rvl` CLI and print its version.
!rvl version

In [None]:
import redis

# Redis connection params
REDIS_HOST = 'search01d.us.caas.oneadp.com'
REDIS_PORT = 443
REDIS_PASSWORD = dbutils.widgets.get("redis_secret")

# Create Redis client (TLS enabled, responses decoded to str).
# NOTE(review): ssl_cert_reqs="none" turns off server-certificate
# verification — confirm this is intended for this endpoint.
redis_connection_kwargs = dict(
  host=REDIS_HOST,
  port=REDIS_PORT,
  password=REDIS_PASSWORD,
  decode_responses=True,
  ssl_cert_reqs="none",
  ssl=True,
)
redis_client = redis.Redis(**redis_connection_kwargs)
# Test connection
redis_client.ping()

In [None]:
# redis_client.flushall()

In [None]:
from redisvl.schema import IndexSchema
from redisvl.index import SearchIndex

index_name = "action_links"

# Index-level settings: hash storage, with the index name reused as key prefix.
index_settings = {
  "name": index_name,
  "prefix": index_name,
  "storage_type": "hash",
}

# Vector field: 3072-dimensional float32 embeddings, cosine distance, flat algorithm.
embedding_field = {
  "type" : "vector",
  "name" : "embedding",
  "attrs" : {
    "dims": 3072,
    "distance_metric": "cosine",
    "algorithm": "flat",
    "datatype": "float32",
  },
}

# Searchable / filterable metadata fields followed by the vector field.
index_fields = [
  {"type" : "tag", "name" : "caption", "attrs": {"sortable": True}},
  {"type" : "text", "name" : "subtitle"},
  {"type" : "text", "name" : "combined"},
  {"type" : "numeric", "name" : "views", "attrs": {"sortable": True}},
  embedding_field,
]

schema = IndexSchema.from_dict({"index": index_settings, "fields": index_fields})

# (Re)create the index; overwrite=True / drop=True presumably replace an
# existing definition and discard its previously indexed data — confirm.
index = SearchIndex(schema, redis_client)
index.create(overwrite=True, drop=True)

In [None]:
# Load every row of pdf into the Redis index, passed as a list of per-row dicts.
index.load(pdf.to_dict(orient="records"))

In [None]:
# Sanity check after loading: report the size of the Redis database.
redis_client.dbsize()

### Vector Search Approaches

In [None]:
# Embed a sample query once; the resulting `vector` is reused by the query cells below.
user_query = "Mana"
vector = get_openai_embedding(client_secret, user_query)

In [None]:
from redisvl.query import VectorQuery

# Unfiltered vector search: ask for the 5 best matches to the query embedding,
# returning caption, subtitle and the score.
query_embedding_bytes = np.array(vector).astype(np.float32).tobytes()

vec_query = VectorQuery(
    vector=query_embedding_bytes,
    vector_field_name="embedding",
    num_results=5,
    return_fields=["caption", "subtitle"],
    return_score=True,
)

result = index.query(vec_query)
pd.DataFrame(result)

In [None]:
from redisvl.query.filter import Tag

# Tag filter on the `caption` field for the value "Asset Management".
tag_filter = Tag("caption") == "Asset Management"

# Attach the filter to the vector query built in the previous cell (mutates it in place).
vec_query.set_filter(tag_filter)

# Re-run the same vector query, now with the filter attached.
result = index.query(vec_query)
pd.DataFrame(result)

In [None]:
from redisvl.query.filter import Num

# Combine a caption tag match with a lower bound on views into one filter.
tag_filter = Tag("caption") == "Asset Management"
num_filter = Num("views") >= 2
combined_filter = tag_filter & num_filter

# Vector query (5 results) that carries the combined filter from construction.
query_embedding_bytes = np.array(vector).astype(np.float32).tobytes()
vec_query = VectorQuery(
    vector=query_embedding_bytes,
    vector_field_name="embedding",
    num_results=5,
    return_fields=["caption", "subtitle"],
    return_score=True,
    filter_expression=combined_filter,
)

result = index.query(vec_query)
pd.DataFrame(result)

In [None]:
from redisvl.query.filter import Text

# Vector search (3 results) restricted by a text filter on the subtitle field.
text_filter = Text("subtitle") % "Asset Management"

subtitle_query_kwargs = {
    "vector": np.array(vector).astype(np.float32).tobytes(),
    "vector_field_name": "embedding",
    "num_results": 3,
    "return_fields": ["caption", "subtitle"],
    "return_score": True,
    "filter_expression": text_filter,
}
vec_query = VectorQuery(**subtitle_query_kwargs)

result = index.query(vec_query)
pd.DataFrame(result)

In [None]:
# Text filter on `combined` with a trailing "*" (presumably a prefix match on
# "mana" — confirm against the RediSearch query syntax).
text_filter = Text("combined") % "mana*"

query_embedding_bytes = np.array(vector).astype(np.float32).tobytes()
vec_query = VectorQuery(
    vector=query_embedding_bytes,
    vector_field_name="embedding",
    num_results=10,
    return_fields=["caption", "subtitle", "combined"],
    return_score=True,
    filter_expression=text_filter,
)

result = index.query(vec_query)
pd.DataFrame(result)

In [None]:
# Text filter on `combined` with the term wrapped in "%" (presumably a fuzzy
# match on "mana" — confirm against the RediSearch query syntax).
text_filter = Text("combined") % "%mana%"

fuzzy_query_kwargs = dict(
    vector=np.array(vector).astype(np.float32).tobytes(),
    vector_field_name="embedding",
    num_results=10,
    return_fields=["caption", "subtitle", "combined"],
    return_score=True,
    filter_expression=text_filter,
)
vec_query = VectorQuery(**fuzzy_query_kwargs)

result = index.query(vec_query)
pd.DataFrame(result)

### Range Query

In [None]:
from redisvl.query import RangeQuery

# Range query: bounded by a distance threshold of 0.8 instead of a result count.
query_embedding_bytes = np.array(vector).astype(np.float32).tobytes()

range_query = RangeQuery(
    vector=query_embedding_bytes,
    vector_field_name="embedding",
    return_fields=["caption", "subtitle", "combined"],
    return_score=True,
    distance_threshold=0.8,
)

result = index.query(range_query)
pd.DataFrame(result)

In [None]:
# Same distance-threshold query as before, built without an explicit return_score.
query_embedding_bytes = np.array(vector).astype(np.float32).tobytes()
range_query = RangeQuery(
    vector=query_embedding_bytes,
    vector_field_name="embedding",
    return_fields=["caption", "subtitle", "combined"],
    distance_threshold=0.8,
)

# Only keep entries with at least 20 views.
numeric_filter = Num("views") >= 20
range_query.set_filter(numeric_filter)

# The numeric filter is attached to the range query itself, so the filter and
# the vector range condition are sent together in a single query.
result = index.query(range_query)
pd.DataFrame(result)

### Hybrid Search Approaches

In [None]:
from redisvl.query import HybridQuery

# Embed a new query; the same string also drives the text side of the hybrid search.
user_query = "Manage"
vector = get_openai_embedding(client_secret, user_query)

# Hybrid query: BM25 text scoring on `subtitle` combined with vector
# similarity on `embedding`, using the library's default weighting.
query_embedding_bytes = np.array(vector).astype(np.float32).tobytes()
hybrid_query = HybridQuery(
    text=user_query,
    text_field_name="subtitle",
    text_scorer="BM25",
    vector=query_embedding_bytes,
    vector_field_name="embedding",
    return_fields=["caption", "subtitle", "combined"],
)

result = index.query(hybrid_query)
pd.DataFrame(result)

In [None]:
# Hybrid query with an explicit alpha and a larger result set (20 results).
hybrid_query = HybridQuery(
    text=user_query,
    text_field_name="subtitle",
    text_scorer="BM25",
    vector=np.array(vector).astype(np.float32).tobytes(),
    vector_field_name="embedding",
    alpha=0.7, # NOTE(review): redisvl documents alpha as the weight of the VECTOR score (text gets 1 - alpha), so 0.7 favors the vector side; use a value below 0.5 to weight the vector score lower — confirm intent
    num_results=20,
    return_fields=["caption", "subtitle", "combined"],
)

result = index.query(hybrid_query)
pd.DataFrame(result)