In [None]:
# Resolve the target catalog and schema names from the notebook widgets.
ml_catalog, ml_search_db = (
    dbutils.widgets.get(widget_name)
    for widget_name in ("ml_catalog", "ml_search_db")
)

# Load every column of the ml_search_action table via Spark SQL.
df_action = spark.sql(
    f"select * from {ml_catalog}.{ml_search_db}.ml_search_action"
)

### Final Score Distribution for Search Results

In [None]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window

In [None]:
# Pull a random sample of at most 10,000 rows to the driver as a pandas frame.
# The fixed seed keeps the sample stable across re-runs as far as Spark allows
# (ordering by rand() can still vary if the input partitioning changes).
pdf = df_action.orderBy(F.rand(seed=42)).limit(10000).toPandas()
pdf['resPos'] = pdf['resPos'].astype(int)

# Summary statistics of finalScore per result position, formatted to two
# decimals. Column-wise Series.map is used instead of DataFrame.applymap,
# which is deprecated since pandas 2.1; this form also works on older pandas.
pdf[['resPos', 'finalScore']].groupby('resPos').describe().apply(
    lambda column: column.map(lambda value: '%.2f' % value)
)

### Analysis of Irrelevant Results

In [None]:
from pathlib import Path
import sys

# Make the directory two levels above the current working directory
# importable (presumably where the `search` package lives — confirm against
# the repository layout).
current_dir = Path.cwd()
parent_dir = current_dir.parent.parent

# Append only when missing so re-running this cell does not pile up
# duplicate sys.path entries.
if str(parent_dir) not in sys.path:
    sys.path.append(str(parent_dir))

In [None]:
from search.utils.data_exploration import drop_docs_analysis, compare_docs_with_clicks

In [None]:
# Replace df_action with the output of the project helper drop_docs_analysis.
# NOTE(review): later cells read a `keep_label` column, presumably added by
# this helper — confirm in search.utils.data_exploration.
# Re-running this cell feeds the helper its own previous output; re-run the
# load cell first if that matters.
df_action = drop_docs_analysis(df_action)

In [None]:
# display(df_action)

In [None]:
# Number of distinct queryId values in the search-action data.
df_action.select('queryId').distinct().count()

In [None]:
# Per result position: count of non-null keep_label values (startDocs), the
# sum of keep_label (keepDocs), and keepDocs as a percentage of startDocs.
# Rows are ordered by resPos.
df_aggregated = (
    df_action
    .groupBy('resPos')
    .agg(
        F.count('keep_label').alias('startDocs'),
        F.sum('keep_label').alias('keepDocs'),
    )
    .orderBy('resPos')
    .withColumn('keepDocs %', F.col('keepDocs') / F.col('startDocs') * 100)
)

In [None]:
# Render the per-position keep statistics as a notebook table.
display(df_aggregated)

In [None]:
# Sum the per-position counts into overall totals (single-row result).
total_sums = df_aggregated.agg(
    F.sum('startDocs').alias('totalstartDocs'),
    F.sum('keepDocs').alias('totalkeepDocs')
).collect()[0]

# F.sum yields NULL (None in Python) when there is nothing to add up, so fall
# back to 0 instead of comparing or dividing None below.
total_start_doc_count = total_sums['totalstartDocs'] or 0
total_final_doc_count = total_sums['totalkeepDocs'] or 0

# Dropped share = 1 - kept / total. The division happens only inside the
# guard so an empty input cannot raise ZeroDivisionError.
if total_start_doc_count > 0:
    percentage = (1 - (total_final_doc_count / total_start_doc_count)) * 100
    print(f"Total Dropped Doc%: {percentage:.2f}%")
else:
    # Reached when there are no documents at all to evaluate.
    print("No documents were dropped.")

### Analysis of Irrelevant Search Results for Individual Client

In [None]:
# Distinct client ids present in the search-action table.
client_ids = [row["_token_client_id"] for row in spark.sql(f"select distinct _token_client_id from {ml_catalog}.{ml_search_db}.ml_search_action").collect()]

print(f"number of clients: {len(client_ids)}")

# "002" stays the preferred default. If it is absent from the data, fall back
# to the first id returned so the default is always one of the choices.
# NOTE(review): Databricks is understood to reject a dropdown whose default is
# not among its choices — confirm against the dbutils.widgets documentation.
preferred_default = "002"
default_client_id = preferred_default
if client_ids and preferred_default not in client_ids:
    default_client_id = client_ids[0]

dbutils.widgets.dropdown(
    "client_id",
    default_client_id,
    client_ids
)

# Read back whichever client is currently selected in the widget.
client_id = dbutils.widgets.get("client_id")
print(f"Analysis of client_id: {client_id}")

In [None]:
# Per-position keep statistics restricted to the client chosen in the widget:
# count of non-null keep_label (startDocs), sum of keep_label (keepDocs) and
# the kept percentage rounded to two decimals, ordered by resPos.
df_aggregated_client = (
    df_action
    .where(F.col('_token_client_id') == client_id)
    .groupBy('resPos')
    .agg(
        F.count('keep_label').alias('startDocs'),
        F.sum('keep_label').alias('keepDocs'),
    )
    .orderBy('resPos')
    .withColumn(
        'keepDocs %',
        F.round(F.col('keepDocs') / F.col('startDocs') * 100, 2),
    )
)
display(df_aggregated_client)

In [None]:
# Sum the selected client's per-position counts into overall totals
# (single-row result).
total_sums = df_aggregated_client.agg(
    F.sum('startDocs').alias('totalstartDocs'),
    F.sum('keepDocs').alias('totalkeepDocs')
).collect()[0]

# F.sum yields NULL (None in Python) when the client filter matched no rows;
# fall back to 0 so the comparison below does not raise TypeError.
total_start_doc_count = total_sums['totalstartDocs'] or 0
total_final_doc_count = total_sums['totalkeepDocs'] or 0

# Dropped share = 1 - kept / total, computed only when there is a non-zero
# total to divide by.
if total_start_doc_count > 0:
    percentage = (1 - (total_final_doc_count / total_start_doc_count)) * 100
    print(f"Total Dropped Doc%: {percentage:.2f}%")
else:
    # Reached when the selected client has no documents to evaluate.
    print("No documents were dropped.")

## Final Score Distribution for Click Results

In [None]:
# Load the click-joined search table, keeping only rows whose `action`
# column equals 'actions'.
df_click = spark.sql(
    f"""select * from {ml_catalog}.{ml_search_db}.ml_search_with_click
                     where action = 'actions' """
)

In [None]:
# Replace df_click with the output of the project helper drop_docs_analysis.
# NOTE(review): later cells read a `keep_label` column, presumably added by
# this helper — confirm in search.utils.data_exploration.
# Re-running this cell feeds the helper its own previous output; re-run the
# load cell first if that matters.
df_click = drop_docs_analysis(df_click)

In [None]:
# display(df_click)

In [None]:
# Number of distinct queryId values in the click data.
df_click.select('queryId').distinct().count()

In [None]:
# Per result position, aggregate document and click counts:
#   startDocs   - count of non-null keep_label values
#   keepDocs    - sum of keep_label
#   startClicks - sum of click
#   keepClicks  - sum of keep_label * click
df_aggregated = df_click.groupBy('resPos').agg(
    F.count('keep_label').alias('startDocs'),
    F.sum(F.col('keep_label')).alias('keepDocs'),
    F.sum(F.col('click')).alias('startClicks'),
    F.sum(F.col('keep_label') * F.col('click')).alias('keepClicks')
).orderBy('resPos')

# Derived percentages, each rounded to two decimals.
df_aggregated = (
    df_aggregated
    .withColumn(
        'KeepDoc %',
        F.round((F.col('keepDocs') / F.col('startDocs')) * 100, 2))
    .withColumn(
        'KeepClicks %',
        # With no clicks to begin with, report 100 instead of dividing by zero.
        F.when(F.col('startClicks') == 0, 100)
        .otherwise(F.round((F.col('keepClicks') / F.col('startClicks')) * 100, 2)))
    .withColumn(
        'Start CTR %',
        F.round(F.col('startClicks') / F.col('startDocs') * 100, 2))
    .withColumn(
        'After CTR %',
        # Guard the denominator: when keepDocs is 0 the value stays NULL,
        # which is what the unguarded division yields with ANSI mode off,
        # while avoiding a divide-by-zero failure when ANSI mode is on.
        F.when(
            F.col('keepDocs') != 0,
            F.round(F.col('keepClicks') / F.col('keepDocs') * 100, 2)))
)

In [None]:
# Render the per-position document and click statistics as a notebook table.
display(df_aggregated)

In [None]:
# Hand the aggregated click statistics to the project helper imported from
# search.utils.data_exploration (its output is defined there, not in this
# notebook).
compare_docs_with_clicks(df_aggregated)

### Analysis of Irrelevant Results for Individual Client

In [None]:
# Distinct client ids present in the click table (rows with action = 'actions').
client_ids = [row["_token_client_id"] for row in spark.sql(f"""select distinct _token_client_id from {ml_catalog}.{ml_search_db}.ml_search_with_click where action = 'actions' """).collect()]

# "002" stays the preferred default. If it is absent from the data, fall back
# to the first id returned so the default is always one of the choices.
# NOTE(review): Databricks is understood to reject a dropdown whose default is
# not among its choices — confirm against the dbutils.widgets documentation.
preferred_default = "002"
default_client_id = preferred_default
if client_ids and preferred_default not in client_ids:
    default_client_id = client_ids[0]

dbutils.widgets.dropdown(
    "client_id",
    default_client_id,
    client_ids
)

# Read back whichever client is currently selected in the widget.
client_id = dbutils.widgets.get("client_id")
print(f"client_id: {client_id}")

In [None]:
# Per result position, for the client chosen in the widget, aggregate document
# and click counts:
#   startDocs   - count of non-null keep_label values
#   keepDocs    - sum of keep_label
#   startClicks - sum of click
#   keepClicks  - sum of keep_label * click
df_aggregated_client = df_click.filter(F.col('_token_client_id') == client_id).groupBy('resPos').agg(
    F.count('keep_label').alias('startDocs'),
    F.sum(F.col('keep_label')).alias('keepDocs'),
    F.sum(F.col('click')).alias('startClicks'),
    F.sum(F.col('keep_label') * F.col('click')).alias('keepClicks')
).orderBy('resPos')

# Derived percentages, each rounded to two decimals.
df_aggregated_client = (
    df_aggregated_client
    .withColumn(
        'KeepDoc %',
        F.round((F.col('keepDocs') / F.col('startDocs')) * 100, 2))
    .withColumn(
        'KeepClicks %',
        # With no clicks to begin with, report 100 instead of dividing by zero.
        F.when(F.col('startClicks') == 0, 100)
        .otherwise(F.round((F.col('keepClicks') / F.col('startClicks')) * 100, 2)))
    .withColumn(
        'Start CTR %',
        F.round(F.col('startClicks') / F.col('startDocs') * 100, 2))
    .withColumn(
        'After CTR %',
        # Guard the denominator: when keepDocs is 0 the value stays NULL,
        # which is what the unguarded division yields with ANSI mode off,
        # while avoiding a divide-by-zero failure when ANSI mode is on.
        F.when(
            F.col('keepDocs') != 0,
            F.round(F.col('keepClicks') / F.col('keepDocs') * 100, 2)))
)

In [None]:
# Hand the selected client's aggregated click statistics to the project helper
# imported from search.utils.data_exploration (its output is defined there,
# not in this notebook).
compare_docs_with_clicks(df_aggregated_client)