In [None]:
# Catalog and schema names are read from the notebook widgets.
ml_catalog = dbutils.widgets.get("ml_catalog")
ml_search_db = dbutils.widgets.get("ml_search_db")

# Load the click-annotated search rows for the 'actions' and 'people' action types.
click_query = f"""select * from {ml_catalog}.{ml_search_db}.ml_search_with_click
                     where action = 'actions' or action = 'people' """
df_click = spark.sql(click_query)

### Final Score Distribution

In [None]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window

In [None]:
# Pull a random sample of rows to the driver and summarise finalScore per result position.
# The seed makes the sample repeatable as long as the input data and its partitioning
# are unchanged; with an unseeded rand() every re-run describes a different set of rows.
SAMPLE_SEED = 42
SAMPLE_SIZE = 10000

pdf = df_click.orderBy(F.rand(seed=SAMPLE_SEED)).limit(SAMPLE_SIZE).toPandas()
pdf['resPos'] = pdf['resPos'].astype(int)
# Render every statistic with two decimals so the table stays compact.
pdf[['resPos', 'finalScore']].groupby('resPos').describe().applymap(lambda x: '%.2f' % x)

In [None]:
# Per-position summary statistics of the click column, formatted with two decimals.
click_by_position = pdf[['resPos', 'click']].groupby('resPos')
click_by_position.describe().applymap(lambda value: '%.2f' % value)

### Analysis of Irrelevant Results

In [None]:
from pathlib import Path
import sys

# Put the directory two levels above the working directory on sys.path,
# presumably so that the project imports in the next cell can be resolved.
current_dir = Path.cwd()
parent_dir = current_dir.parent.parent
# Guard the append: re-running this cell must not stack duplicate entries.
if str(parent_dir) not in sys.path:
    sys.path.append(str(parent_dir))

In [None]:
from search.utils.data_exploration import drop_docs_analysis, compare_docs_with_clicks

In [None]:
# Run the project helper over the click data; its result replaces df_click.
# NOTE(review): the input name is rebound, so re-running this cell feeds the helper
# its own output — confirm drop_docs_analysis tolerates that.
# Later cells read a `keep_label` column, presumably added here — TODO confirm.
df_click = drop_docs_analysis(df_click)

In [None]:
# display(df_click)

In [None]:
# Number of distinct queryId values in the data being analysed.
df_click.select('queryId').distinct().count()

In [None]:
# Per result position: docs and clicks before filtering ("start") and the part that
# remains when weighting by keep_label ("keep"). keep_label is summed and multiplied
# with click, i.e. it is used as a 0/1 flag.
df_aggregated = df_click.groupBy('resPos').agg(
    F.count('keep_label').alias('startDocs'),
    F.sum(F.col('keep_label')).alias('keepDocs'),
    F.sum(F.col('click')).alias('startClicks'),
    F.sum(F.col('keep_label') * F.col('click')).alias('keepClicks')
).orderBy('resPos')

# Derived percentages. Every ratio is wrapped in F.when(...) so the division is only
# evaluated for a non-zero denominator: the value is NULL otherwise (what non-ANSI
# Spark returns for x / 0) instead of a DIVIDE_BY_ZERO error when ANSI mode is on.
df_aggregated = (
    df_aggregated
    .withColumn(
        'KeepDoc %',
        F.when(F.col('startDocs') != 0,
               F.round((F.col('keepDocs') / F.col('startDocs')) * 100, 2)))
    .withColumn(
        'KeepClicks %',
        # No clicks at this position means none can be lost: report 100.
        F.when(F.col('startClicks') == 0, 100)
         .otherwise(F.round((F.col('keepClicks') / F.col('startClicks')) * 100, 2)))
    .withColumn(
        'Start CTR %',
        F.when(F.col('startDocs') != 0,
               F.round(F.col('startClicks') / F.col('startDocs') * 100, 2)))
    .withColumn(
        'After CTR %',
        F.when(F.col('keepDocs') != 0,
               F.round(F.col('keepClicks') / F.col('keepDocs') * 100, 2)))
)

In [None]:
# Show the per-position keep/click summary table.
display(df_aggregated)

In [None]:
# Hand the per-position summary to the project helper from search.utils.data_exploration.
# NOTE(review): presumably renders a docs-vs-clicks comparison — confirm in the helper.
compare_docs_with_clicks(df_aggregated)

### Extreme Cases

In [None]:
# Clicked rows whose keep_label is 0, counted per position / action / query /
# clicked label / client and listed with the most frequent combination first.
is_dropped_click = (F.col("click") == 1) & (F.col("keep_label") == 0)
df_dropped_clicks = (
    df_click
    .where(is_dropped_click)
    .withColumn("query", F.col("label"))
    .withColumn("client_id", F.col("_token_client_id"))
    .groupBy("resPos", "action", "query", "click_label", "client_id")
    .count()
    .orderBy(F.desc("count"))
)

In [None]:
# Show the grouped dropped-click counts.
display(df_dropped_clicks)

In [None]:
# Row-level detail for clicked rows whose keep_label is 0.
dropped_click_filter = (F.col("keep_label") == 0) & (F.col("click") == 1)
collect_traceIds = df_click.where(dropped_click_filter).select(
    "action",
    "client_id",
    "traceId",
    "resPos",
    "label",
    "click_label",
    "finalScore",
)

In [None]:
# Bring the traceId column to the driver as a list of Rows, highest finalScore first.
trace_ids_by_score = collect_traceIds.sort(F.desc("finalScore")).select("traceId")
traceIds = trace_ids_by_score.collect()

In [None]:
# Number of collected rows (one per clicked row with keep_label == 0).
print(len(traceIds))

In [None]:
# Show the traceIds of the dropped clicks, highest finalScore first.
# The frame is derived straight from collect_traceIds instead of
# spark.createDataFrame(traceIds, ...): createDataFrame cannot infer a schema from an
# empty list and would raise when there are no dropped clicks. This also keeps the
# column's original type and avoids shipping the collected rows back to the cluster.
trace_ids_df = (collect_traceIds
    .orderBy(F.col("finalScore").desc())
    .select(F.col("traceId")))
display(trace_ids_df)

In [None]:
# Candidate values for the trace_id dropdown, in the order the rows were collected.
# Dropdown widgets are limited to 1024 choices; cap a little below that limit.
MAX_TRACE_ID_CHOICES = 1000

# dict.fromkeys removes repeated traceIds (a trace may contribute several rows) while
# keeping first-seen order, so duplicates do not use up the capped slots.
trace_ids = list(dict.fromkeys(row["traceId"] for row in traceIds))[:MAX_TRACE_ID_CHOICES]
# Empty string when there is nothing to choose from.
default_trace_id = trace_ids[0] if trace_ids else ""

In [None]:
# Create the trace_id dropdown and read back the current selection.
# A dropdown's default has to be one of its choices, so when no trace ids were found
# fall back to a single-entry list holding the (empty) default instead of an empty list.
# NOTE(review): confirm the widget accepts an empty-string choice on your runtime.
dbutils.widgets.dropdown(
    "trace_id",
    default_trace_id,
    trace_ids if trace_ids else [default_trace_id]
)
trace_id = dbutils.widgets.get("trace_id")
print(f"trace_id: {trace_id}")

In [None]:
# Click-side rows of the selected trace, ordered by result position.
# The widget value is compared as a column literal rather than pasted into SQL text,
# so quotes or other SQL syntax inside trace_id cannot break or alter the query.
display(
    spark.table(f"{ml_catalog}.{ml_search_db}.ml_search_with_click")
    .where(F.col("traceId") == trace_id)
    .select(
        "traceId",
        "resPos",
        "client_id",
        "label",
        "click_label",
        "click_category",
        "click_details_caption",
    )
    .orderBy("resPos")
)

In [None]:
# Rows of the selected trace in the ml_search_action table, ordered by result position.
# The widget value is compared as a column literal rather than pasted into SQL text,
# so quotes or other SQL syntax inside trace_id cannot break or alter the query.
display(
    spark.table(f"{ml_catalog}.{ml_search_db}.ml_search_action")
    .where(F.col("traceId") == trace_id)
    .select(
        "traceId",
        "resPos",
        "client_id",
        "_id",
        "caption",
        "subtitle",
        "solrScore",
        "finalScore",
    )
    .orderBy("resPos")
)

In [None]:
# Rows of the selected trace in the ml_search_people table, ordered by result position.
# The widget value is compared as a column literal rather than pasted into SQL text,
# so quotes or other SQL syntax inside trace_id cannot break or alter the query.
display(
    spark.table(f"{ml_catalog}.{ml_search_db}.ml_search_people")
    .where(F.col("traceId") == trace_id)
    .select(
        "traceId",
        "resPos",
        "client_id",
        "_id",
        "legalName",
        "displayName",
        "eID",
        "location",
        "position",
        "finalScore",
    )
    .orderBy("resPos")
)

### Analysis of Irrelevant Results for Individual Client

In [None]:
# Distinct client ids that have 'actions' or 'people' rows.
client_rows = spark.sql(f"""select distinct _token_client_id from {ml_catalog}.{ml_search_db}.ml_search_with_click
                                                           where action = 'actions' or action = 'people' """).collect()
# Skip NULL ids (they cannot be offered as a choice) and sort so the dropdown order
# does not depend on the row order returned by `select distinct`.
client_ids = sorted(
    row["_token_client_id"] for row in client_rows if row["_token_client_id"] is not None
)

# "002" stays the preferred default, but a dropdown's default must be one of its
# choices, so fall back to the first available id when "002" is not in the data.
PREFERRED_CLIENT_ID = "002"
if PREFERRED_CLIENT_ID in client_ids:
    default_client_id = PREFERRED_CLIENT_ID
else:
    default_client_id = client_ids[0] if client_ids else ""

dbutils.widgets.dropdown(
    "client_id",
    default_client_id,
    client_ids if client_ids else [default_client_id]
)
client_id = dbutils.widgets.get("client_id")
print(f"client_id: {client_id}")

In [None]:
# Same per-position summary as for the full data set, restricted to the client
# selected in the client_id widget.
df_aggregated_client = df_click.filter(F.col('_token_client_id') == client_id).groupBy('resPos').agg(
    F.count('keep_label').alias('startDocs'),
    F.sum(F.col('keep_label')).alias('keepDocs'),
    F.sum(F.col('click')).alias('startClicks'),
    F.sum(F.col('keep_label') * F.col('click')).alias('keepClicks')
).orderBy('resPos')

# Derived percentages. Every ratio is wrapped in F.when(...) so the division is only
# evaluated for a non-zero denominator: the value is NULL otherwise (what non-ANSI
# Spark returns for x / 0) instead of a DIVIDE_BY_ZERO error when ANSI mode is on.
# A single client is more likely to have positions where nothing is kept.
df_aggregated_client = (
    df_aggregated_client
    .withColumn(
        'KeepDoc %',
        F.when(F.col('startDocs') != 0,
               F.round((F.col('keepDocs') / F.col('startDocs')) * 100, 2)))
    .withColumn(
        'KeepClicks %',
        # No clicks at this position means none can be lost: report 100.
        F.when(F.col('startClicks') == 0, 100)
         .otherwise(F.round((F.col('keepClicks') / F.col('startClicks')) * 100, 2)))
    .withColumn(
        'Start CTR %',
        F.when(F.col('startDocs') != 0,
               F.round(F.col('startClicks') / F.col('startDocs') * 100, 2)))
    .withColumn(
        'After CTR %',
        F.when(F.col('keepDocs') != 0,
               F.round(F.col('keepClicks') / F.col('keepDocs') * 100, 2)))
)

In [None]:
# Hand the selected client's per-position summary to the project helper.
# NOTE(review): presumably renders a docs-vs-clicks comparison — confirm in the helper.
compare_docs_with_clicks(df_aggregated_client)

### Extreme Cases

In [None]:
# Clicked rows with keep_label == 0 for the selected client only, counted per
# position / action / query / clicked label / client, most frequent first.
# Note: this rebinds df_dropped_clicks, replacing the all-clients result.
client_rows_only = F.col('_token_client_id') == client_id
clicked_but_dropped = (F.col("click") == 1) & (F.col("keep_label") == 0)
df_dropped_clicks = (
    df_click
    .where(client_rows_only)
    .where(clicked_but_dropped)
    .withColumn("query", F.col("label"))
    .withColumn("client_id", F.col("_token_client_id"))
    .groupBy("resPos", "action", "query", "click_label", "client_id")
    .count()
    .orderBy(F.desc("count"))
)

In [None]:
# Show the grouped dropped-click counts for the selected client.
display(df_dropped_clicks)