In [None]:
# Widget parameters pick the catalog and schema that hold the search tables.
ml_catalog = dbutils.widgets.get("ml_catalog")
ml_search_db = dbutils.widgets.get("ml_search_db")

# Per search query, rank the clicked results (caption/subtitle) by click volume.
#   click             - click events carrying a usable request_correlation_id
#   search_click      - search results joined to the clicks they received
#   click_aggregation - clicks per (query, caption, subtitle), dense-ranked
#                       inside each query by total clicks
# The final projection adds max_action_rank, the highest dense rank reached by
# a query (i.e. how many distinct click-count tiers it has).
#
# Only the first click per (request_correlation_id, resPos, traceId) is counted.
# row_number() is used for that instead of rank(): rank() assigns 1 to every row
# tied on time_stamp, so duplicated click events would all pass the filter and
# inflate total_clicks.
# NOTE(review): assumes ml_search_action holds one row per
# (request_correlation_id, resPos, traceId) - confirm.
df_action = spark.sql(f"""
with click as (
    select request_correlation_id,
        _token_session_id,
        _token_associate_id,
        object_id,
        time_stamp,
        label,
        client_id,
        category,
        details_caption
    from {ml_catalog}.{ml_search_db}.ml_search_click
    where request_correlation_id is not null and lower(request_correlation_id) != 'nan'
),

search_click as (
    select search.request_correlation_id,
        search._token_client_id as client_id,
        search.label AS query,
        search.resPos,
        search.traceId,
        search.caption,
        search.subtitle,
        search.solrScore,
        search.finalScore,
        row_number() over (partition by search.request_correlation_id, search.resPos, search.traceId order by click.time_stamp) as click_seq,
        1 AS click_count,
        click._token_session_id as click_session_id,
        click._token_associate_id as click_associate_id,
        click.object_id as click_object_id,
        click.time_stamp as click_time_stamp,
        click.label as click_label,
        click.client_id as click_client_id,
        click.category as click_category,
        click.details_caption as click_details_caption
    from {ml_catalog}.{ml_search_db}.ml_search_action search
    inner join click
    on search.request_correlation_id = click.request_correlation_id
    and search._id = click.object_id
),

click_aggregation AS (
    SELECT query,
           caption,
           subtitle,
           sum(click_count) AS total_clicks,
           DENSE_RANK() OVER (PARTITION BY query ORDER BY sum(click_count) DESC) AS action_rank
    FROM search_click
    WHERE click_seq = 1
    GROUP BY query, caption, subtitle
)

SELECT query,
        caption,
        subtitle,
        total_clicks,
        action_rank,
        max(action_rank) over (PARTITION BY query) AS max_action_rank
FROM click_aggregation
""")

In [None]:
from pyspark.sql.functions import col

# Keep queries that reach at least three click-rank tiers, and within those
# keep only the results sitting in the three best tiers.
has_enough_tiers = col("max_action_rank") >= 3
in_top_tiers = col("action_rank") <= 3
report_columns = ["query", "caption", "subtitle", "total_clicks", "action_rank", "max_action_rank"]

filtered_df = (
    df_action
    .where(has_enough_tiers & in_top_tiers)
    .select(*report_columns)
    .sort("query", "action_rank")
)

display(filtered_df)

### Top Keywords

In [None]:
def _combine_caption_subtitle(caption, subtitle) -> str:
    """Merge a result's caption and subtitle into one comma-separated string.

    Args:
        caption: Caption of the clicked result; may be None/NaN.
        subtitle: Subtitle of the clicked result; may be None/NaN.

    Returns:
        The caption alone when both texts match case-insensitively, the single
        available text when the other one is missing or empty, otherwise
        "caption,subtitle". An empty string when neither is usable.
    """
    # NULL columns come back from toPandas() as None/NaN, which have no
    # .lower() and cannot be joined; anything that is not a non-empty string is
    # treated as missing.
    parts = [part for part in (caption, subtitle) if isinstance(part, str) and part]
    if len(parts) == 2 and parts[0].lower() == parts[1].lower():
        return parts[0]
    return ','.join(parts)


pdf_action = df_action.toPandas()
pdf_action['combined'] = [
    _combine_caption_subtitle(caption, subtitle)
    for caption, subtitle in zip(pdf_action['caption'], pdf_action['subtitle'])
]
# One document per row, separated by a blank line; rows with no usable text
# are left out so they do not produce empty documents.
documents = "\n\n".join(text for text in pdf_action.combined if text)
documents[:100]

In [None]:
from pathlib import Path
import sys

# Put the directory two levels above the working directory on the import path.
# NOTE(review): presumably this is the root that contains the `search` package
# imported in the next cell - confirm against the repository layout.
current_dir = Path.cwd()
parent_dir = current_dir.parent.parent
# Guard the append so that re-running this cell does not pile up duplicates.
if str(parent_dir) not in sys.path:
    sys.path.append(str(parent_dir))

In [None]:
from search.utils.data_profiling_nlp import CustomTextSplitter

# `documents` was assembled with a blank line between entries, so the same
# separator is handed to the project splitter.
DOCUMENT_SEPARATOR = "\n\n"

splitter = CustomTextSplitter(separator=DOCUMENT_SEPARATOR)
# NOTE(review): split_text is assumed to return an iterable of strings, since
# the keyword-counting cell consumes it as such - confirm against
# search.utils.data_profiling_nlp.
processed_tokens = splitter.split_text(documents)

In [None]:
from collections import Counter
import pandas as pd

# A processed token can hold several comma-separated keywords, so each token is
# split on ',' before counting. Surrounding whitespace is trimmed and empty
# fragments are dropped: joining and re-splitting an empty `processed_tokens`
# would otherwise count the empty string '' as a keyword.
keyword_tokens = [
    keyword.strip()
    for token in processed_tokens
    for keyword in token.split(',')
    if keyword.strip()
]

# keyword -> number of occurrences across all processed tokens
most_common_keywords = Counter(keyword_tokens)

In [None]:
# Tabulate the 100 most frequent keywords, highest count first.
top_keyword_counts = most_common_keywords.most_common(100)
df = (
    pd.DataFrame(top_keyword_counts, columns=['keyword', 'count'])
    .sort_values(by='count', ascending=False)
)
display(df)

In [None]:
secret_scope = dbutils.widgets.get("secret_scope")

# Everything before the first "-" of the scope name is compared with "prod".
environment = secret_scope.partition("-")[0]
if environment == "prod":
    # End the notebook run here; the Redis cells below are not meant for prod.
    dbutils.notebook.exit("Skip run in prod environment")

### Redis Auto Suggest

In [None]:
import redis

# Connection settings for the Redis instance that stores the suggestions.
REDIS_HOST = 'search01d.us.caas.oneadp.com'
REDIS_PORT = 443
REDIS_PASSWORD = dbutils.widgets.get("redis_secret")

# NOTE(review): ssl_cert_reqs="none" turns off TLS certificate verification, so
# the server's identity is not checked - confirm this is acceptable here.
connection_options = dict(
    host=REDIS_HOST,
    port=REDIS_PORT,
    password=REDIS_PASSWORD,
    decode_responses=True,  # replies come back as str instead of bytes
    ssl=True,
    ssl_cert_reqs="none",
)
redis_client = redis.Redis(**connection_options)

# Round-trip once so a bad host or password fails here rather than mid-load.
redis_client.ping()

In [None]:
# Number of keys currently stored in the connected Redis database.
redis_client.dbsize()

In [None]:
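# WARNING: FLUSHALL deletes every key in every database of the Redis server.
# Keep this commented out unless a full manual reset is really intended.
# redis_client.flushall()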

In [None]:
from redis.commands.search.suggestion import Suggestion

total = 0
BATCH_SIZE = 10_000

# Score each keyword by its occurrence count. Blank keywords are skipped
# because an empty suggestion string is of no use to an autocomplete lookup.
suggestions = [
    Suggestion(keyword, float(count))
    for keyword, count in most_common_keywords.items()
    if keyword and keyword.strip()
]

autocomplete = redis_client.ft()
for start in range(0, len(suggestions), BATCH_SIZE):
    batch = suggestions[start:start + BATCH_SIZE]
    # sugadd() builds and executes its own non-transactional pipeline on every
    # call, so wrapping single-suggestion calls in an outer pipeline batches
    # nothing. Passing a whole batch to one call sends it in one round trip.
    # NOTE(review): increment=True adds to any score already stored under the
    # same string, so re-running this cell adds the counts again - confirm
    # that accumulation across runs is intended.
    autocomplete.sugadd('top_action_keywords', *batch, increment=True)
    total += len(batch)

print(f"Inserted/updated {total:,} keywords into 'top_action_keywords'")

In [None]:
from typing import List, Tuple

def get_suggestions(
    redis_client,
    prefix: str,
    max_results: int = 10,
    fuzzy: bool = True,
    suggestion_key: str = 'top_action_keywords',
) -> List[Tuple[str, float]]:
    """Look up autocomplete suggestions for a prefix.

    Args:
        redis_client: Client exposing ``ft().sugget(...)``.
        prefix: Text typed so far.
        max_results: Upper bound on the number of suggestions requested.
        fuzzy: Forwarded to ``sugget`` as its ``fuzzy`` flag.
        suggestion_key: Suggestion dictionary to query; defaults to the
            dictionary this notebook populates.

    Returns:
        ``(suggestion string, score)`` pairs in the order the server returned
        them; an empty list when nothing matched.
    """
    suggestions = redis_client.ft().sugget(
        suggestion_key,
        prefix,
        num=max_results,
        fuzzy=fuzzy,
        with_scores=True,
    )

    # A falsy reply (None or an empty list) means there was no match.
    return [(s.string, s.score) for s in suggestions or []]

In [None]:
# Smoke test: fetch suggestions for the prefix "ma" with the default settings.
get_suggestions(redis_client, "ma")