In [None]:
# Catalog and schema come from notebook widgets so the same notebook can target
# different environments.
ml_catalog, ml_search_db = (
    dbutils.widgets.get(widget_name) for widget_name in ("ml_catalog", "ml_search_db")
)

# Load every column of the people-search table.
df_people = spark.sql(
    f"select * from {ml_catalog}.{ml_search_db}.ml_search_people"
)

In [None]:
# Show the unaggregated ml_search_people rows in the notebook output for a visual check.
display(df_people)

### Display Top Search People Views

In [None]:
# Count rows (aliased `views`) per (client_id, _id, eID, location, position) for
# context 'US', highest count first. GROUP BY / ORDER BY refer to select-list ordinals.
top_people_views_sql = f"""
    SELECT _token_client_id AS client_id, 
        _id, 
        eID, 
        location, 
        position, 
        count(*) AS views
    FROM {ml_catalog}.{ml_search_db}.ml_search_people
    WHERE context='US'
    GROUP BY 1, 2, 3, 4, 5
    ORDER BY 6 desc
"""

# NOTE: this rebinds df_people from the raw table to the aggregated result.
df_people = spark.sql(top_people_views_sql)
display(df_people)

In [None]:
# Profile the aggregated views DataFrame with the Databricks data-summary utility.
dbutils.data.summarize(df_people)

In [None]:
# NOTE: `sum` and `round` imported here are the Spark column functions; from this
# cell onward they shadow the Python builtins of the same name.
from pyspark.sql.functions import col, sum, round

# Grand total of `views` over all rows; the denominator for every percentage below.
# A global aggregate yields exactly one row, so first() returns it.
tot = df_people.agg(sum("views")).first()[0]

# Views per client and each client's share of the grand total, largest first.
views_by_client = (
    df_people.groupBy("client_id")
    .agg(sum("views").alias("total_views"))
    .withColumn("percent", round(col("total_views") / tot * 100, 2))
    .orderBy(col("total_views").desc())
)
views_by_client.show()

In [None]:
# Views per location and each location's share of the grand total (`tot`), largest first.
views_by_location = (
    df_people.groupBy("location")
    .agg(sum("views").alias("total_views"))
    .withColumn("percent", round(col("total_views") / tot * 100, 2))
    .orderBy(col("total_views").desc())
)
views_by_location.show()

In [None]:
# Views per position and each position's share of the grand total (`tot`), largest first.
views_by_position = (
    df_people.groupBy("position")
    .agg(sum("views").alias("total_views"))
    .withColumn("percent", round(col("total_views") / tot * 100, 2))
    .orderBy(col("total_views").desc())
)
views_by_position.show()

### LLM Embedding

In [None]:
from pyspark.sql.functions import concat_ws

# One text key per row: "client_id | location | position". This is the string that
# gets embedded further down.
combined_text = concat_ws(" | ", "client_id", "location", "position")
df_people = df_people.withColumn("combined", combined_text)

# Keep only the distinct combined strings and bring them to the driver as pandas.
df = df_people.select("combined").dropDuplicates()
pdf = df.toPandas()
display(pdf)

In [None]:
from pathlib import Path
import sys

# The `search` package is imported from the directory two levels above the
# notebook's working directory, so that directory must be on sys.path.
current_dir = Path.cwd()
parent_dir = current_dir.parent.parent

# Guarded so that re-running this cell does not pile up duplicate sys.path entries.
if str(parent_dir) not in sys.path:
    sys.path.append(str(parent_dir))

# Certificate bundle shipped under search/utils.
# Fixed: the directory was misspelled "seach", which pointed at a non-existent path.
cert_path = parent_dir / "search" / "utils" / "ADP_Internal_Root_CA_GN2.pem"

In [None]:

from search.utils.data_profiling_llm import get_bearer_token

# Exchange the client secret (supplied through a notebook widget) for a bearer token.
client_secret = dbutils.widgets.get("client_secret")
bearer_token = get_bearer_token(client_secret)

# Never print the token itself: cell output is stored with the notebook and would
# leak a live credential. Only report whether a value came back.
print(f"Bearer token acquired: {bool(bearer_token)}")

In [None]:
from search.utils.data_profiling_llm import invoke_titan_model

# Smoke test: call the project helper with a fixed sample string and print whatever
# it returns.
# NOTE(review): presumably the result is a Titan embedding for the text — confirm
# against the helper's implementation.
embr = invoke_titan_model(client_secret, "testing embedding")
print(embr)

In [None]:
from search.utils.data_profiling_llm import get_openai_embedding

# Smoke test: call the project helper with a fixed sample string and print whatever
# it returns. This rebinds `embr` from the previous cell.
# NOTE(review): presumably the result is an OpenAI embedding vector — confirm
# against the helper's implementation.
embr = get_openai_embedding(client_secret, "testing embedding")
print(embr)

In [None]:
secret_scope = dbutils.widgets.get("secret_scope")

# The text before the first "-" in the scope name is compared against "prod";
# when it matches, the notebook stops here and the cells below are not run.
scope_prefix = secret_scope.partition("-")[0]
if scope_prefix == "prod":
    dbutils.notebook.exit("No need to run the following code in Prod environment")

In [None]:
import time

# Fixed seed so the same 1% of rows is drawn from a given `pdf` on every run;
# without it the embedded rows (and everything written downstream) change per run.
SAMPLE_SEED = 42

embed_start_time = time.time()

# NOTE: `pdf` is rebound to its own 1% sample, so running this cell a second time
# without re-creating `pdf` samples the already-sampled frame again.
pdf = pdf.sample(frac=0.01, replace=False, random_state=SAMPLE_SEED).reset_index(drop=True)

# One helper call per row of the sample; the result is stored in a new `embedding` column.
pdf["embedding"] = pdf["combined"].apply(lambda text: get_openai_embedding(client_secret, text))

embed_time = time.time() - embed_start_time
print(f"Embedding took {embed_time} seconds")

In [None]:
# Persist `pdf` as CSV at <parent_dir>/search/data/embed.csv, without the pandas index.
# NOTE: to_csv does not create missing directories; search/data must already exist.
data_csv = parent_dir / "search"/ "data" / "embed.csv"
pdf.to_csv(data_csv, index=False)

In [None]:
from pyspark.sql import functions as F

# Column expression for the day before the current date. It is not a Python date;
# Spark evaluates it when the expression is used in a query.
yesterday = F.date_sub(F.current_date(), 1)

In [None]:
# Let Delta merge schema changes on write for this session.
spark.conf.set("spark.databricks.delta.schema.autoMerge.enabled","true")

# Drop any previous copy of the embeddings table before rewriting it.
embed_table = f"{ml_catalog}.{ml_search_db}.ml_search_people_embed"
spark.sql(f"DROP TABLE IF EXISTS {embed_table}")

# Convert the pandas frame to Spark and stamp each row with yesterday's
# year / month / day, which become the partition columns.
# `yesterday` is already a Column, so it is passed to the date functions directly.
spark_df = (
    spark.createDataFrame(pdf)
    .withColumn("year", F.year(yesterday))
    .withColumn("month", F.month(yesterday))
    .withColumn("day", F.day(yesterday))
)

# Write as a Delta table partitioned by the date columns.
embed_writer = (
    spark_df.write.format("delta")
    .mode("overwrite")
    .option("mergeSchema", "true")
    .partitionBy("year", "month", "day")
)
embed_writer.saveAsTable(embed_table)

### Dimension Reduction

In [None]:
# import pandas as pd
# from ast import literal_eval
# import numpy as np

# df = pd.read_csv(data_csv)
# df["embedding"] = df.embedding.apply(literal_eval).apply(np.array)

In [None]:
# matrix = np.vstack(df.embedding.values)
# print(matrix.shape)

In [None]:
# from search.utils.data_profiling_llm import global_cluster_embeddings

# reduced_embeddings = global_cluster_embeddings(matrix, dim=50)
# print(reduced_embeddings.shape)

In [None]:
# from hdbscan import HDBSCAN

# hdbscan_model = HDBSCAN(min_cluster_size=25, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
# hdbscan_model.fit(reduced_embeddings)
# labels = hdbscan_model.labels_
# labels.shape

In [None]:
# for i in list(set(labels)):
#     print("--------------------")
#     print(f"Cluster {i}")
#     for index in np.where(labels==i)[0][:10]:
#         print(df.iloc[index]['combined'])