In [0]:
# Catalog and database names interpolated into the fully qualified table name below.
ml_catalog = "onedata_us_east_1_shared_dit"
ml_search_db = "nas_raw_lyric_search_dit"

# Count the rows of ml_search_action per (_id, caption, subtitle) for context 'US',
# expose the count as `views`, and order from most to least frequent.
# GROUP BY / ORDER BY use positional references: 1-3 are the grouping keys, 4 is `views`.
df_action = spark.sql(f"""
    SELECT _id, caption, subtitle, count(*) AS views
    FROM {ml_catalog}.{ml_search_db}.ml_search_action
    WHERE context='US'
    GROUP BY 1, 2, 3
    ORDER BY 4 DESC
""")

In [0]:
# Render the per-action view counts produced by the query above.
display(df_action)

_id,caption,subtitle,views
7843598ab66a41d28253f40b5e9a3c56,Manage Associate's Pay Profile,Maintain and capture Associate's pay data records within a legal entity and/or pay group.,1170
119fb43320ab44c9847c9b2284f7e848,View My Associate Profile,View My Associate Profile,1087
d0adbfea7381492e976ec090d26777ba,Manage Benefits Supplemental Fields,View and Manage Associate's benefit related supplemental eligibility and compliance values.,1036
899c324e3f2742ce9cd9d27a97048cc5,View Company Org Chart,View Company Org Chart,936
0429ea12a5974851ab47620c9d7205c9,View Criteria Control Center,"Setup and manage the search queries that give eligibility, approvals, authorizations and other apps the flexibility and power they need to evolve with your workforce.",864
df6e2b589b0d462ebcdfb01bae2106c3,Add My Personal Contacts,Add a contact to your profile,838
846b123c190448f8a7b82f942d7609bd,View and Manage Associates' Benefits,Manage changes to associates' Benefits,816
9925c8101c7b43e98eeb2c6539728764,Manage Payroll Company Data,Configure company pay data and maintain payroll settings within a legal entity and/or pay group.,772
6aa709dc9ad44b7ea7e61424086e2415,View Your Pay,Access your pay details.,760
2a1ec3581d634da888cc03ce2dd8d818,Benefits Data Management,"View and edit benefits data, add new records, and download benefits reports for all associates.",726


### Tokenize and Normalize

In [0]:
import re
import string

In [0]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [0]:
# Download the NLTK data packages used by the tokenizer / stop-word / lemmatizer
# code in this notebook: the 'punkt' tokenizer models, the 'stopwords' corpus and
# the 'wordnet' corpus.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [0]:
from typing import List
from nltk.stem import PorterStemmer, WordNetLemmatizer

class CustomTextSplitter:
    """Split a text on a separator and reduce each segment to lemmatized keywords.

    Every segment is tokenized with NLTK, stripped of punctuation, stop words and
    non-alphabetic tokens, lemmatized with WordNet and returned as a single
    comma-separated string. No frequency-based (top-k) selection happens here.
    """

    # Translation table that deletes every ASCII punctuation character.
    # Built once at class level so it is shared by all instances and all calls.
    _punct_table = str.maketrans('', '', string.punctuation)

    def __init__(self, separator: str, **kwargs):
        """
        Initialize the splitter.
        :param separator: The string to split the text on.
        :param kwargs: Accepted for backward compatibility; currently ignored.
        """
        self._separator = separator
        self._stopwords = set(stopwords.words('english'))

    def normalize_text(self, text: str) -> str:
        """
        Normalize free text: drop line breaks, hashtags, URLs, user mentions and
        punctuation, lowercase it, collapse runs of spaces and trim the ends.
        :param text: The text to normalize; ``None`` is tolerated.
        :return: The normalized text, or a single space when ``text`` is ``None``.
        """
        # The original None check ran after the regex calls and could never be
        # reached; do it up front so the documented fallback actually applies.
        if text is None:
            return ' '

        # Replace line breaks (both types) with spaces
        text = re.sub(r'[\n\r]', ' ', text)

        # Replace hashtags. The previous pattern '#([[a-zA-Z]|[0-9]]+)' was malformed
        # and only removed '#' plus a single character, leaving the rest of the tag.
        text = re.sub(r'#[a-zA-Z0-9]+', ' ', text)

        # Replace URLs
        text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', ' ', text)

        # Replace user mentions
        text = re.sub(r'@(\w+)', ' ', text)

        # Convert to lowercase
        text = text.lower()

        # Remove punctuation (reuse the shared table instead of rebuilding it per call)
        text = text.translate(self._punct_table)

        # Replace two or more subsequent white spaces with a single space
        text = re.sub(r'\ {2,}', ' ', text)

        # Trim white spaces at the beginning or end
        return text.strip()

    def split_text(self, text: str) -> List[str]:
        """
        Split the input text using the separator and turn each segment into a
        comma-separated string of lemmatized keywords.
        :param text: The full text to split and analyze.
        :return: One keyword string per segment, in segment order. A segment with
                 no surviving tokens yields an empty string.
        """
        # The lemmatizer does not depend on the segment, so create it once
        # rather than once per loop iteration.
        lemmatizer = WordNetLemmatizer()

        keyword_chunks: List[str] = []
        for segment in text.split(self._separator):
            tokens = word_tokenize(segment.lower())
            # NOTE(review): punctuation is deleted rather than replaced by a space,
            # so a token such as "and/or" becomes "andor" - confirm this is intended.
            cleaned = (t.translate(self._punct_table) for t in tokens)
            filtered_tokens = [
                self.normalize_text(t)
                for t in cleaned
                if t.isalpha() and t not in self._stopwords
            ]
            keyword_chunks.append(
                ",".join(lemmatizer.lemmatize(word) for word in filtered_tokens)
            )

        return keyword_chunks

In [0]:
def _caption_with_subtitle(row):
    """Return the caption alone when the subtitle repeats it (case-insensitively),
    otherwise the caption and subtitle joined by a comma."""
    if row.caption.lower() == row.subtitle.lower():
        return row.caption
    return ','.join([row.caption, row.subtitle])


# Bring the Spark result into pandas and add one text field per action.
pdf_action = df_action.toPandas()
pdf_action['combined'] = pdf_action.apply(_caption_with_subtitle, axis=1)

# Concatenate all actions into one string, separated by blank lines.
documents = "\n\n".join(pdf_action.combined)
documents[:100]

"Manage Associate's Pay Profile,Maintain and capture Associate's pay data records within a legal enti"

In [0]:
# `documents` joins the per-action texts with "\n\n", so splitting on the same
# separator yields one keyword string per action.
splitter = CustomTextSplitter(separator="\n\n")
processed_tokens = splitter.split_text(documents)

In [0]:
# Inspect the comma-separated keyword strings returned by split_text.
processed_tokens

['view,associate,profile',
 'manage,benefit,supplemental,field,view,manage,associate,benefit,related,supplemental,eligibility,compliance,value',
 'manage,associate,pay,profile,maintain,capture,associate,pay,data,record,within,legal,entity,andor,pay,group',
 'view,company,org,chart',
 'view,criterion,control,center,setup,manage,search,query,give,eligibility,approval,authorization,apps,flexibility,power,need,evolve,workforce',
 'add,personal,contact,add,contact,profile',
 'view,manage,associate,benefit,manage,change,associate,benefit',
 'benefit,data,management,view,edit,benefit,data,add,new,record,download,benefit,report,associate',
 'view,pay,access,pay,detail',
 'manage,payroll,company,data,configure,company,pay,data,maintain,payroll,setting,within,legal,entity,andor,pay,group',
 'edit,personal,demographic,edit,gender,race,nationality,marital,status,birth,detail',
 'create,criterion,create,new,search,query,app,domain',
 'hiring,dashboard,navigate,new,hire,dashboard,start,hire',
 'work

### Extract Corpus

In [0]:
from collections import Counter
import pandas as pd

# Flatten the per-document keyword strings into one token list.
# A document that produced no keywords is an empty string, which would otherwise
# surface as an empty-string token and be counted as a keyword; skip those.
words = ','.join(processed_tokens)
w = [token for token in words.split(',') if token]

# Keep the 100 most frequent keywords, then list them alphabetically for display.
most_common_keywords = Counter(w).most_common(100)
sorted_keywords = sorted(most_common_keywords, key=lambda x: x[0])
top_keywords = pd.DataFrame(sorted_keywords, columns=['keyword', 'count'])
display(top_keywords)

keyword,count
action,9
add,14
administrar,27
adp,12
afficher,26
agregar,7
ajouter,7
approval,9
associate,38
at,7


Databricks visualization. Run in Databricks to view.

### Extract Keywords by Frequency

In [0]:
# Alias only: `passages` refers to the same list object as `processed_tokens`.
passages = processed_tokens

In [0]:
# Show the first two passages, each prefixed with "--" (requires at least two passages).
print(f"--{passages[0]}\n--{passages[1]}")

--view,associate,profile
--manage,benefit,supplemental,field,view,manage,associate,benefit,related,supplemental,eligibility,compliance,value


In [0]:
!pip install rank_bm25

Looking in indexes: https://artifactory.us.caas.oneadp.com/artifactory/api/pypi/pypi/simple/
Collecting rank_bm25
  Downloading https://artifactory.us.caas.oneadp.com/artifactory/api/pypi/pypi/packages/packages/2a/21/f691fb2613100a62b3fa91e9988c991e9ca5b89ea31c0d3152a3210344f9/rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Installing collected packages: rank_bm25
Successfully installed rank_bm25-0.2.2
[43mNote: you may need to restart the kernel using %restart_python or dbutils.library.restartPython() to use updated packages.[0m


In [0]:
from rank_bm25 import BM25Okapi


def bm25_tokenizer(text):
    """Tokenize a passage or a query for BM25 indexing and scoring.

    Tokens may be separated by commas and/or whitespace, so both the
    comma-separated keyword strings of the corpus and free-text queries such as
    "hr dashboard" are split into individual terms. Leading and trailing
    punctuation is stripped from every token and empty tokens are dropped.

    :param text: Comma- and/or whitespace-separated text.
    :return: List of non-empty tokens in their original order.
    """
    # Treat commas like whitespace, then let str.split() handle runs of separators.
    pieces = text.replace(',', ' ').split()
    stripped = (piece.strip(string.punctuation) for piece in pieces)
    return [token for token in stripped if token]

# Tokenize every passage with the tokenizer that search() also applies to queries.
tokenized_corpus = [bm25_tokenizer(doc) for doc in passages]

# Build the BM25 index over the tokenized corpus.
bm25 = BM25Okapi(tokenized_corpus)

In [0]:
import numpy as np


def search(query, top_k=3, num_candidates=100):
    """Run a BM25 (lexical) search over ``passages`` and print the best matches.

    :param query: Query text; tokenized with ``bm25_tokenizer`` before scoring.
    :param top_k: Number of hits to print.
    :param num_candidates: Size of the candidate pool taken from the BM25 scores
        before sorting; capped at the number of scored passages.
    :return: None. The query and the hits are printed.
    """
    print("Input question:", query)

    ##### BM25 search (lexical search) #####
    bm25_scores = bm25.get_scores(bm25_tokenizer(query))

    # np.argpartition raises ValueError when kth is out of bounds, so never ask
    # for more candidates than there are scores.
    num_candidates = min(num_candidates, len(bm25_scores))
    top_n = np.argpartition(bm25_scores, -num_candidates)[-num_candidates:]
    bm25_hits = [{'corpus_id': idx, 'score': bm25_scores[idx]} for idx in top_n]
    bm25_hits = sorted(bm25_hits, key=lambda x: x['score'], reverse=True)

    # Report the requested number of hits instead of a hard-coded "3".
    print(f"\nTop-{top_k} lexical search (BM25) hits")
    print("-----------------------------------")

    for hit in bm25_hits[0:top_k]:
        print("\t{:.3f}\t{}".format(hit['score'], passages[hit['corpus_id']].replace("\n", " ")))

    # TODO: optional re-ranking of the BM25 candidates (previously sketched here as
    # commented-out code around a `co.rerank(...)` call) is not implemented.

In [0]:
# Example single-keyword query: prints the three best-scoring passages.
search("dashboard", top_k=3, num_candidates=100)

Input question: dashboard

Top-3 lexical search (BM25) hits
-----------------------------------
	4.771	view,hr,dashboard,hr,dashboard
	4.771	view,payroll,dashboard,payroll,dashboard
	4.566	registration,dashboard,dashboard,track,associate,registration



### Extract Keywords

In [0]:
# Compute frequency distribution and get top K
keyword_chunks: List[str] = []
for tokens in processed_tokens:
    token_list = tokens.split(",")
    freq_dist = nltk.FreqDist(token_list)
    top_keywords = [word for word, _ in freq_dist.most_common(5)]
    keyword_chunks.append(",".join(top_keywords))
pdf_action['keywords'] = keyword_chunks
pdf_action.drop(['combined'], axis=1, inplace=True)

In [0]:
# Show the actions together with their extracted `keywords` column.
display(pdf_action)

_id,caption,subtitle,views,keywords
119fb43320ab44c9847c9b2284f7e848,View My Associate Profile,View My Associate Profile,1298,"view,associate,profile"
d0adbfea7381492e976ec090d26777ba,Manage Benefits Supplemental Fields,View and Manage Associate's benefit related supplemental eligibility and compliance values.,1213,"manage,benefit,supplemental,field,view"
7843598ab66a41d28253f40b5e9a3c56,Manage Associate's Pay Profile,Maintain and capture Associate's pay data records within a legal entity and/or pay group.,1184,"pay,associate,manage,profile,maintain"
899c324e3f2742ce9cd9d27a97048cc5,View Company Org Chart,View Company Org Chart,1148,"view,company,org,chart"
0429ea12a5974851ab47620c9d7205c9,View Criteria Control Center,"Setup and manage the search queries that give eligibility, approvals, authorizations and other apps the flexibility and power they need to evolve with your workforce.",1001,"view,criterion,control,center,setup"
df6e2b589b0d462ebcdfb01bae2106c3,Add My Personal Contacts,Add a contact to your profile,998,"add,contact,personal,profile"
846b123c190448f8a7b82f942d7609bd,View and Manage Associates' Benefits,Manage changes to associates' Benefits,993,"manage,associate,benefit,view,change"
2a1ec3581d634da888cc03ce2dd8d818,Benefits Data Management,"View and edit benefits data, add new records, and download benefits reports for all associates.",965,"benefit,data,management,view,edit"
6aa709dc9ad44b7ea7e61424086e2415,View Your Pay,Access your pay details.,915,"pay,view,access,detail"
9925c8101c7b43e98eeb2c6539728764,Manage Payroll Company Data,Configure company pay data and maintain payroll settings within a legal entity and/or pay group.,881,"payroll,company,data,pay,manage"


In [0]:
import numpy as np

def search(query, top_k=3, num_candidates=100):
    """Run a BM25 (lexical) search over ``processed_tokens`` and print the best matches.

    :param query: Query text; tokenized with ``bm25_tokenizer`` before scoring.
    :param top_k: Number of hits to print.
    :param num_candidates: Size of the candidate pool taken from the BM25 scores
        before sorting; capped at the number of scored passages.
    :return: None. The query and the hits are printed.
    """
    print("Input question:", query)

    ##### BM25 search (lexical search) #####
    # The index was built from token lists, so the query must be tokenized as well.
    # Passing the raw string would make BM25 iterate over its individual characters.
    bm25_scores = bm25.get_scores(bm25_tokenizer(query))

    # np.argpartition raises ValueError when kth is out of bounds, so never ask
    # for more candidates than there are scores.
    num_candidates = min(num_candidates, len(bm25_scores))
    top_n = np.argpartition(bm25_scores, -num_candidates)[-num_candidates:]
    bm25_hits = [{'corpus_id': idx, 'score': bm25_scores[idx]} for idx in top_n]
    bm25_hits = sorted(bm25_hits, key=lambda x: x['score'], reverse=True)

    # Report the requested number of hits instead of a hard-coded "3".
    print(f"\nTop-{top_k} lexical search (BM25) hits")
    print("-----------------------------------")
    for hit in bm25_hits[0:top_k]:
        print("\t{:.3f}\t{}".format(hit['score'], processed_tokens[hit['corpus_id']].replace("\n", " ")))


### Top 3 Query with Clicks

In [0]:
# NOTE(review): these two names repeat the values assigned in the first cell, and
# `df_action` is rebound below to a differently shaped result than the earlier
# per-action view counts.
ml_catalog = "onedata_us_east_1_shared_dit"
ml_search_db = "nas_raw_lyric_search_dit"

# Top clicked actions per query, built as a chain of CTEs:
#   click             - rows of ml_search_click whose request_correlation_id is not
#                       null and is not the string 'nan' (compared case-insensitively).
#   search_click      - ml_search_action inner-joined to `click` on
#                       request_correlation_id and _id = object_id; `rank` orders the
#                       joined rows per (request_correlation_id, resPos, traceId) by
#                       click time, and every row carries click_count = 1.
#   click_aggregation - keeps only rank = 1 rows, sums click_count per
#                       (query, caption, subtitle) and dense-ranks the actions within
#                       each query by total clicks, highest first.
#   ranked_clicks     - adds the highest action_rank reached within each query.
# The final SELECT keeps queries that reach an action_rank of at least 3 and returns
# their rows with action_rank <= 3, ordered by query and rank.
df_action = spark.sql(f"""
with click as (
    select request_correlation_id,
        _token_session_id, 
        _token_associate_id, 
        object_id, 
        time_stamp, 
        label, 
        client_id, 
        category, 
        details_caption 
    from {ml_catalog}.{ml_search_db}.ml_search_click 
    where request_correlation_id is not null and lower(request_correlation_id) != 'nan'
),

search_click as (
        select search.request_correlation_id,
        search._token_client_id as client_id,
        search.label AS query,
        search.resPos,
        search.traceId,
        search.caption,
        search.subtitle,
        search.solrScore,
        search.finalScore,
        rank() over (partition by search.request_correlation_id, search.resPos, search.traceId order by click.time_stamp) as rank,
        1 AS click_count,
        click._token_session_id as click_session_id, 
        click._token_associate_id as click_associate_id, 
        click.object_id as click_object_id, 
        click.time_stamp as click_time_stamp, 
        click.label as click_label, 
        click.client_id as click_client_id, 
        click.category as click_category, 
        click.details_caption as click_details_caption
    from {ml_catalog}.{ml_search_db}.ml_search_action search
    inner join click
    on search.request_correlation_id = click.request_correlation_id
    and search._id = click.object_id
),

click_aggregation AS (
    SELECT query,
           caption,
           subtitle,
           sum(click_count) AS total_clicks,
           DENSE_RANK() OVER (PARTITION BY query ORDER BY sum(click_count) DESC) AS action_rank
    FROM search_click
    WHERE rank = 1
    GROUP BY query, caption, subtitle
),

ranked_clicks AS (
    SELECT query,
           caption,
           subtitle,
           total_clicks,
           action_rank,
           max(action_rank) over (PARTITION BY query) AS max_action_rank
    FROM click_aggregation
)

SELECT query,
       caption,
       subtitle,
       total_clicks,
       action_rank,
       max_action_rank
FROM ranked_clicks
WHERE max_action_rank >= 3
AND action_rank <= 3
ORDER BY query, action_rank
""")

In [0]:
# Render the per-query click ranking.
# NOTE(review): the saved output of this cell is a NameError for `df_action`, which
# suggests it was last run in a session where the query cell above had not been
# executed - re-run the notebook top to bottom to confirm.
display(df_action)

[0;31m---------------------------------------------------------------------------[0m
[0;31mNameError[0m                                 Traceback (most recent call last)
File [0;32m<command-4886578537347935>, line 1[0m
[0;32m----> 1[0m display(df_action)

[0;31mNameError[0m: name 'df_action' is not defined