In [2]:
import pandas as pd
import os

# for inverted index dictionary
import math
from collections import defaultdict

#for scoring candidates with tf-idf cosine
from collections import Counter
from gensim import similarities


# for fuzzy matching of company names
from rapidfuzz import process, fuzz

# for removal of role phrases
from flashtext import KeywordProcessor

In [3]:
# --TFIDF--
from gensim.corpora import Dictionary
from gensim.models import TfidfModel

# --Word2Vec--
from gensim.models.phrases import Phrases, Phraser
from gensim.models import Word2Vec
from gensim.models import KeyedVectors

# --SBERT--
from sentence_transformers import SentenceTransformer
import chromadb
## NOTE: chromadb requires java installed 

# Widget creation

## import packages

In [4]:
# Widget making
from ipywidgets import widgets, VBox, HBox, Layout
from IPython.display import display, clear_output

# General
import pandas as pd
import numpy as np
from datetime import datetime
import os


# for saving and loading data
import pickle

## import in data/models

In [5]:
current_directory = os.getcwd()
data_directory = os.path.join(current_directory, 'data')
output_directory = os.path.join(current_directory, 'output')


### additional data cleaning 

In [6]:
# Load in data
job_path = os.path.join(data_directory, 'data_cleaned_1.csv')
df = pd.read_csv(job_path)

# Drop 6 records where description is all Chinese (resulting in NA after cleaning)
df = df.dropna(subset=["processed_title+desc"])

# convert "type_clean" into list
df["type_clean"] = df["type_clean"].str.lower().str.split(", ").apply(list)


In [7]:
import re 

def normalize_text_company(text):
    text = text.lower()                                    # lowercase
    text = re.sub(r'\([^)]*\)|\[[^\]]*\]', "", text)       # remove text within parenthesis or square brackets
    text = re.sub(r'&', 'and', text)                       # convert & to and
    text = re.sub(r'malaysia|sdn|bhd|sb| s b|berhad|pte|plt|pty|group|holdings|co\.|ltd\.| ltd', '', text).strip()
    text = re.sub(r'[^A-Za-z0-9\s]', ' ', text)            # keep letters & numbers only
    text = re.sub(r'\s+', ' ', text).strip()               # normalize spaces

    if re.fullmatch(r'\d+', text):
        return ""   # clean off entire field if all only digits (12 digit numbers is the Malaysian company reg no. but no use here)
    
    return text

df['company_clean'] = df['company'].apply(normalize_text_company)
df['company_clean'] = df['company_clean'].replace('', 'nil')

In [8]:
# Create gazatteer as a list for later extracting of companies from query
companies = df['company_clean'].unique()
companies.sort()

### importing variables for models

In [9]:
# --Inverted Index--
with open('output/inverted_index.pkl', 'rb') as file:
    postings = pickle.load(file)
with open('output/vocab.pkl', 'rb') as file:
    vocab=pickle.load( file)


In [10]:
# --TFIDF--
with open('output/tfidf_model.pkl', 'rb') as file:
    model_tfidf = pickle.load(file)
with open('output/corpus_tfidf.pkl', 'rb') as file:
    corpus_tfidf=pickle.load( file)
with open('output/dct.pkl', 'rb') as file:
    dct_tfidf=pickle.load( file)


In [11]:
#--Word2Vec--
bigram = Phraser.load("output/bigram_phraser.model")
model_w2v = Word2Vec.load("output/word2vec.model")
wv = KeyedVectors.load( "output/word2vec.wordvectors", mmap='r')
dct_w2v = Dictionary.load('output/dct_filtered.dict')
tfidf_w2v = TfidfModel.load("output/tfidf_model.tfidf")
with open('output/doc_embeddings_norm.pkl', 'rb') as file:
    doc_embeddings_norm=pickle.load( file)
with open('output/inverted_index_w2v.pkl', 'rb') as file:
    postings_w2v = pickle.load(file)

In [12]:
# --BERT--
# chunk df
with open('output/chunk_df.pkl', 'rb') as file:
    chunk_df=pickle.load(file)
# database of embeddings
client=chromadb.PersistentClient(path=os.path.join(output_directory, 'sbert_database'))
collection = client.get_collection('job_posting_chunks')

In [13]:
# import skills mapping for each category 
import ast 
skills_map=pd.read_csv(os.path.join(current_directory,'dominant_skills_df_with_ESCO_mapping.csv'), converters={'Final_Skills': ast.literal_eval})
skills_map=skills_map[['Category', 'Final_Skills']]

## define widget components

In [14]:
# Unique values for filter options
state_options = sorted(df['State'].unique().tolist())
state_options.remove('Unknown')
type_options = ['Full time', 'Part time', 'Contract/Temp', 'Casual/Vacation']
category_options = sorted(df['merged_category'].unique().tolist())

# Query (Text Input)
query_input = widgets.Text(
    placeholder='Enter search query (e.g., Python developer)',
    layout=Layout(width='90%')
)

# State (Dropdown)
state_filter = widgets.Dropdown(
    options=['All'] + state_options,
    value='All',
    description='State:',
    layout=Layout(width='90%')
)

# Type  (Multi-select)
type_filter = widgets.SelectMultiple(
    options=type_options,
    value=(),
    description='Type:',
    rows=len(type_options), # show all options
    layout=Layout(width='90%')
)

# Category (Dropdown)
category_filter = widgets.Dropdown(
    options=['All'] + category_options,
    value='All',
    description='Category:',
    layout=Layout(width='90%')
)

# Model (radio buttons)
model_choice=widgets.RadioButtons(
    description='(For demo only) Search using:',
    options=['TF-IDF', 'Word2Vec', 'SBERT'],
    value='TF-IDF',
    layout=Layout(width='90%')
)

# Search Button
search_button = widgets.Button(
    description='Search',
    button_style='success',
    tooltip='Click to perform document retrieval'
)

# Reset Button
reset_button = widgets.Button(
    description='Reset filters',
    button_style='info',
    tooltip='Click to perform document retrieval'
)

# Output area to display results
output_area = widgets.Output()

## Query processing

The user's query will pass through 2 steps: company name extraction and query expansion 

### Company Extraction

Try to find company names (if any) in the query. Company extraction has very different signals and we need to wipe out the role related keywords in order to find the entity, and extra terms introduced during query expansion may dirty the company name data, so the logic directly conflicts with the query expansion logic below.

In [15]:
# M'sian company names only allow letters, numbers, &, period, hyphen, ', ()
# But the useful ones are letters, numbers, & (convert to 'and')
def light_normalize(text):
    s = text.lower()
    s = re.sub(r"[^a-z0-9\s&]", " ", s) # Keep only useful characters
    s = s.replace("&", " and ") # replace & with and, similar treatment as company gazatteer
    s = re.sub(r"\s+", " ", s).strip() # collapse multiple white space if any
    return s

In [16]:
# Check for simple event template
event_pat = re.compile(r'\b(?:at|with|for|from)\s+([a-z0-9 ]{2,})$', re.I)

def event_pat_search(text):
    m = event_pat.search(text)
    return [m.group(1)] if m else []

In [17]:
# Define role stop words (this is different from the normal stop words. e.g. procurement executive apple, 
# we want 'procurement executive' to be cleaned off because these are words not used in company names)
# Cannot clean off location words as could be part of the company name (e.g. Royal Selangor Association), except "Malaysia"

# Create keyword processor using FlashText (which is a phrase matcher)
# Throw in all the role_clean values inside since it's very comprehensive, act as role_stopwords
role_keyword_processor = KeywordProcessor(case_sensitive=False)
role_keyword_list = df['role_clean'].str.lower().str.strip().dropna().tolist()
for p in role_keyword_list:
    if p:  # avoid empty strings
        role_keyword_processor.add_keyword(p, " ") # FlashText to replace with empty space (empty string doesnt work) when these phrases found

In [18]:
from nltk.corpus import stopwords

# Still add some custom stop words which are one-word to prevent more unintended matches to company names
# Intended result: No company matches, become general search with no company name filter.
role_stopwords = {
    "intern", "internship", "role", "roles", "job", "jobs", "position", "positions", 
    "career", "careers", "vacancy", "vacancies", "lead", "senior", "junior",
    "remote", "hybrid", "office", "hiring", "malaysia", "analyst", "scientist", "executive",
    "manager", "engineer", "developer"
}

stop_words = set(stopwords.words("english"))
role_stopwords |= stop_words # Try to remove so it will cut down on the no. of N-grams formed

# N-gram sweep required for query cases with no prepositions, like "Apple jobs", "Shopee data scientist"
# Max use trigrams because majority of company names are 3 words
def ngram_sweep(text, n_max=3):
    cleaned_text = role_keyword_processor.replace_keywords(text)
    toks = [t for t in cleaned_text.split() if t not in role_stopwords]
    cands = set()
    for n in range(1, n_max+1): # extract unigram (0-1st pos), till trigram (0-4th pos). total 6 combis
        for i in range(len(toks) - n+1):
            cands.add(" ".join(toks[i:i+n]))
    return sorted(cands, key=lambda x: (-len(x.split()), -len(x))) # sort longer N-grams first

In [19]:
def extract_companies(text):
    s = light_normalize(text)
    cand_companies = list(set(event_pat_search(s) + ngram_sweep(s))) # Get all possible company names from the 2 methods
    return cand_companies

### query cleaning and expansion

In [20]:
# import packages to re-use functions used in cleaning text for database
import re
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.tag import pos_tag
from nltk.wsd import lesk

'''
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
'''

def normalize_text(text):
    text = re.sub(r'<.*?>', ' ', text)                     # remove HTML tags
    text = re.sub(r'\\n|\n', ' ', text)                    # remove newlines
    text = re.sub(r'[^A-Za-z0-9\s]', ' ', text)            # keep letters & numbers only
    text = text.lower()                                    # lowercase
    text = re.sub(r'\s+', ' ', text).strip()               # normalize spaces
    return text

stop_words = set(stopwords.words("english"))
custom_stop = {"experience","responsibilities","requirements","requirement","knowledge",
    "skill","skills","advantage","preferred","strong","good","excellent",
    "degree","diploma","bachelor","graduate","title",
    "independent","self-motivated","hardworking",
    "deadline","pressure","benefits","apply","immediately", "writing","spoken","etc","others","job","work","company",
    "candidate","menu","new","plus","years","revenue","ensure", "provide", "including", "malaysia"}
stop_words |= custom_stop
stop_words |= set(stopwords.words("indonesian")) # unable to find malay stopword, hence using indonasian which is relatively close
lemmatizer = WordNetLemmatizer()

# get POS tagging
def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return 'a'
    elif treebank_tag.startswith('V'):
        return 'v'
    elif treebank_tag.startswith('N'):
        return 'n'
    elif treebank_tag.startswith('R'):
        return 'r'
    else:
        return None
    


### tfidf

In [21]:
def expand_query(text):
    sentences = sent_tokenize(text)
    words= []
    pos_tags=[]
    
    for sent in sentences:
        w = word_tokenize(sent)  
        words+=w
        tags=pos_tag(w) 
        pos_tags+=tags 
                        
    expanded_queries = []
    for word, pos in zip(words, pos_tags):
        pos = get_wordnet_pos(pos[1])
        synsets = wordnet.synsets(word, pos=pos)
        if synsets:
            correct_synset = lesk(words, word, pos=pos)
            if correct_synset:
                max_similarity = 0
                most_similar_synset = None
                for synset in synsets:
                    similarity = correct_synset.path_similarity(synset)
                    if similarity and similarity > max_similarity:
                        max_similarity = similarity
                        most_similar_synset = synset
                if most_similar_synset:
                    expanded_queries+=most_similar_synset.lemma_names()
            else:
                expanded_queries+=word
        else:
            expanded_queries+=word

    tokens=[]

    #include cleaning steps used to clean original database
    for word in expanded_queries: 
        w=normalize_text(word)
        if w.isalpha() and w not in stop_words:        
            lemma = lemmatizer.lemmatize(w)           
            tokens.append(lemma)

    return tokens

### Word2Vec

In [22]:
# Function to compute a single doc vector
# doc_tokens is a list of words (aka tokens) of a document, we process 1 document at a time

def tfidf_weighted_doc_vector(doc_tokens, wv, dictionary, tfidf_model):
    bow = dictionary.doc2bow(doc_tokens) # Use the passed in TF-IDF dictionary to convert the document's tokens into BOW representation
    tfidf_weights = dict(tfidf_model[bow])  # Convert list of tuples to dict for fast lookup {token_id: weight}

    # Init the output vector to be of same dimension as wv vector size, with initial value is 0
    weighted_sum = np.zeros(wv.vector_size, dtype=np.float32)
    weight_total = 0.0 # This is just a validation checker

    # iterate over unique tokens in the doc (compiled from the tfidf_weights)
    for token_id, weight in tfidf_weights.items():
        token = dictionary[token_id] # get the word based on its token_id
        if token in wv: # Find the word in my lightweight vectors, wv
            weighted_sum += weight * wv[token] # Get the word embedding for this word, weighted by importance from TF-IDF model
            weight_total += weight

    if weight_total > 0:
        return (weighted_sum / weight_total).astype(np.float32) # Compute weighted average
    else: # if none of the tokens are in the W2V model's vocab, return zero vector (which is orthogonal and will result in 0 for any cosine similarity) 
        return np.zeros(wv.vector_size, dtype=np.float32)

In [23]:
def _l2_normalize_vec(v, eps=1e-12):
    n = np.linalg.norm(v)
    if n < eps:
        return v  # zero vector stays zero early return, no need divide by zero (cause error)
    return v / n

In [24]:
# Modified `expand_query()` method to add bigram transformation for W2V
def expand_query_w2v(text):
    sentences = sent_tokenize(text)
    words = []
    pos_tags = []
    
    for sent in sentences:
        w = word_tokenize(sent)  
        words += w
        tags = pos_tag(w) 
        pos_tags += tags 
                        
    expanded_queries = []
    for word, pos in zip(words, pos_tags):
        pos = get_wordnet_pos(pos[1])
        synsets = wordnet.synsets(word, pos=pos)
        if synsets:
            correct_synset = lesk(words, word, pos=pos)
            if correct_synset:
                max_similarity = 0
                most_similar_synset = None
                for synset in synsets:
                    similarity = correct_synset.path_similarity(synset)
                    if similarity and similarity > max_similarity:
                        max_similarity = similarity
                        most_similar_synset = synset
                if most_similar_synset:
                    expanded_queries += most_similar_synset.lemma_names()
            else:
                expanded_queries.append(word)
        else:
            expanded_queries.append(word)

    tokens=[]

    # include cleaning steps used to clean original database
    for word in expanded_queries: 
        w=normalize_text(word)
        if w.isalpha() and w not in stop_words:        
            lemma = lemmatizer.lemmatize(w)           
            tokens.append(lemma)

    tokens = bigram[tokens] # Apply bigram model to transform query into same representation as corpus

    return tokens

### SBERT

In [25]:

# Load model optimized for sentence/text embedding
bert_model = SentenceTransformer('all-MiniLM-L6-v2')

def normalize_text_bert(text):
    text = re.sub(r'<.*?>', ' ', text)                     # remove HTML tags
    text = re.sub(r'([^A-Za-z0-9\s])', r' \1 ', text)        # separate punctuation from words/numbers (e.g. executive/assistant -> executive / assistant)
    text = re.sub(r'[ \t]+', ' ', text)               # normalize horizontal spaces
    return text

def process_query(text, model=bert_model):
    text = normalize_text_bert(text)
    query_embedding= model.encode(text)
    return query_embedding

## Filter out candidate documents

### Check fixed filters

In [26]:
# list all values in each of the categorical fields we are using for filter in Series format
state_arr = df["State"].astype(str).values
type_arr = df["type_clean"].values
merged_category_arr = df["merged_category"].astype(str).values

N = len(df)

In [27]:
# Assume data passed in already has validity checks on Front End
def facet_mask(state='All', job_type=[], merged_category='All'):
    m = np.ones(N, dtype=bool) # Initialise an array m of all True first, assume no filters added
    if state !="All": # check if a filter was used by the user, if there is a value passed in      
        m &= (state_arr == str(state)) # check docs that fulfil the user's passed in value, reassign back to mask m
    if job_type != []:
        if isinstance(job_type, list) and len(job_type) > 0: 
            # normalize case once
            sel = [s.lower() for s in job_type]
            m &= np.fromiter(
                (any((t.lower() in sel) for t in type_arr[i]) for i in range(N)),
                dtype=bool,
                count=N
            )
    if merged_category!='All':    
        m &= (merged_category_arr == str(merged_category))
    return m

### Filter if company name is in query

In [28]:
# Special helper function to fuzzy match candidate company names to the company gazatteer (companies)
def best_company(cand_companies, companies, cutoff=80):
    matches = [] # keep track of all successful fuzzy matches
    for cand in cand_companies: # feed all the possible candidates 
        if not cand:
            continue
        # cand=r'\b'+cand+r'\b'
        hits = process.extract( 
            cand,
            companies,
            scorer=fuzz.token_set_ratio
        )
        
        # keep only those >= cutoff
        for name, score, _ in hits:
            if score >= cutoff:
                # Append entire info (for troubleshooting code)
                # matches.append({
                #     "cand": cand,
                #     "matched_company": name,
                #     "score": float(score)
                # })

                matches.append(name)
            
    return matches # Note: may be empty list

### TFIDF

In [29]:
def candidate_docs_tfidf(query, state='All', job_type=[], merged_category='All'):
    keywords = expand_query(query)
    cand_companies = extract_companies(query)

    # 1) Apply facet mask first to quickly reduce no. of documents to do keyword match on
    m = facet_mask(state=state, job_type=job_type, merged_category=merged_category)
    allowed_idx = np.flatnonzero(m) # returns indices where facet_mask is True
    allowed_set = set(allowed_idx) # Reduced candidate set for later

    # 2) Company filtering inside allowed_set. Strategy is to intersect allowed_set with company_set
    company_matches = best_company(cand_companies, companies, cutoff=80)
    if company_matches: # if there are results proceed to filter
        company_set = set(df.index[df['company_clean'].isin(company_matches)])
        allowed_set_intersect = allowed_set.intersection(company_set)
        # Only if there is intersection then we use the intersected set, else continue to use allowed_set
        if allowed_set_intersect:
            allowed_set = allowed_set_intersect  # use intersection

    # 3) Collect keyword candidates using inverted index, only keep docs in allowed_set
    cand = set()
    if keywords:
        for w in keywords:
            tid = vocab.get(w) # check if word in my vocab dictionary
            if tid is None: # Not found, ignore
                continue
            for d, _ in postings[tid]: # if found, get the document id
                if d in allowed_set:         # check if this doc id was in the reduced candidate set already
                    cand.add(d) # If it is, add to the set() called cand
    else:
        # No keywords: candidates are all postings based on categorical filters. 
        cand = set(allowed_idx)

    return np.array(sorted(cand), dtype=int)

### Word2Vec

In [30]:
def candidate_docs_w2v(query, state='All', job_type=[], merged_category='All'):
    keywords = expand_query_w2v(query)
    cand_companies = extract_companies(query)

    # 1) Apply facet mask first to quickly reduce no. of documents to do keyword match on
    m = facet_mask(state=state, job_type=job_type, merged_category=merged_category)
    allowed_idx = np.flatnonzero(m) # returns indices where facet_mask is True
    allowed_set = set(allowed_idx) # Reduced candidate set for later

    # 2) Company filtering inside allowed_set. Strategy is to intersect allowed_set with company_set
    company_matches = best_company(cand_companies, companies, cutoff=80)
    if company_matches: # if there are results proceed to filter
        company_set = set(df.index[df['company_clean'].isin(company_matches)])
        allowed_set_intersect = allowed_set.intersection(company_set)
        # Only if there is intersection then we use the intersected set, else continue to use allowed_set
        if allowed_set_intersect:
            allowed_set = allowed_set_intersect  # use intersection
    
    # 3) Collect keyword candidates using inverted index, only keep docs in allowed_set
    cand = set()
    if keywords:
        for w in keywords:
            tid = dct_w2v.token2id.get(w) # check if word in the dictionary
            if tid is None: # Not found, ignore
                continue
            for d in postings_w2v[tid]: # if found, get the document id
                if d in allowed_set:         # check if this doc id was in the reduced candidate set already
                    cand.add(d) # If it is, add to the set() called cand
    else:
        # No keywords: candidates are all postings based on categorical filters. 
        cand = set(allowed_idx)

    return np.array(sorted(cand), dtype=int)

### SBERT

unlike above, returns the rows from chunk_df which are part of the candidate docs

In [31]:
def candidate_docs_sbert(query, state=None, job_type=None, merged_category=None):
    keywords = expand_query(query)
    cand_companies = extract_companies(query)

    # 1) Apply facet mask first to quickly reduce no. of documents to do keyword match on
    m = facet_mask(state=state, job_type=job_type, merged_category=merged_category)
    allowed_idx = np.flatnonzero(m) # returns indices where facet_mask is True
    allowed_set = set(allowed_idx) # Reduced candidate set for later

    # 2) Company filtering inside allowed_set. Strategy is to intersect allowed_set with company_set
    company_matches = best_company(cand_companies, companies, cutoff=80)
    if company_matches: # if there are results proceed to filter
        company_set = set(df.index[df['company_clean'].isin(company_matches)])
        allowed_set_intersect = allowed_set.intersection(company_set)
        # Only if there is intersection then we use the intersected set, else continue to use allowed_set
        if allowed_set_intersect:
            allowed_set = allowed_set_intersect  # use intersection
            
    # 3) Collect keyword candidates using inverted index, only keep docs in allowed_set
    cand = set()
    if keywords:
        for w in keywords:
            tid = vocab.get(w) # check if word in my vocab dictionary
            if tid is None: # Not found, ignore
                continue
            for d, _ in postings[tid]: # if found, get the document id
                if d in allowed_set:         # check if this doc id was in the reduced candidate set already
                    cand.add(d) # If it is, add to the set() called cand
    else:
        # No keywords: candidates are all postings based on categorical filters. 
        cand = set(allowed_idx)
        
    cand = np.array(sorted(cand), dtype=int)
    if len(cand) == 0:
        return None

    cand_ids=df.iloc[cand]['job_id'].tolist() # retrieve job ids of candidate documents
    cand_chunk_ids=chunk_df[chunk_df['job_id_chunk'].isin(cand_ids)].index.tolist() # get indexes of chunks from candidate documents
    return cand_chunk_ids


## rank documents

In [32]:
from collections import Counter
from gensim import similarities

def rank_tfidf_cosine(query, k=10, state='All', job_type=[], merged_category='All'):
    # Fetch the index positions of the candidate documents
    cand_idx = candidate_docs_tfidf(query, state=state, job_type=job_type, merged_category=merged_category)
    if len(cand_idx) == 0:
        return df.iloc[[]] # return nothing

    # Convert the query into BoW then TF-IDF (using existing Dictionary + TfidfModel)
    q_tokens = expand_query(query)
    q_bow = dct_tfidf.doc2bow(q_tokens)          # unseen tokens are ignored by doc2bow
    q_tfidf = model_tfidf[q_bow]                  # list[(term_id, weight)]

    # Build a similarity index over ONLY the candidates
    cand_corpus_tfidf = [corpus_tfidf[i] for i in cand_idx] # Fetch the candidate TF-IDFs into a list
    index = similarities.SparseMatrixSimilarity(
        cand_corpus_tfidf, # only candidate TF-IDFs
        num_features=len(dct_tfidf)   # use dictionary size
    )

    # Compute cosine similarities of query vs candidate docs
    sims = index[q_tfidf] # returns a list of cosine similarity scores, 0 indexed
    doc_scores = list(zip(cand_idx, sims)) # zip candidate index to the sim score
    doc_scores.sort(key=lambda x: x[1], reverse=True) # sort by sim score, desc order
    topk = doc_scores[:k] # take top k scores 
    top_doc_idx = [doc_id for doc_id, _ in topk] # Extract the candidate index and similarity score separately
    top_scores  = [score  for _, score in topk]

    # fetch original doc from df
    out = df.iloc[top_doc_idx].copy()
    out['score'] = top_scores # Append score for reference
    out = out.sort_values(by='score', ascending=False)
    
    return out

In [33]:
def rank_embed_cosine(query, k=10, state='All', job_type=[], merged_category='All'):
    # Narrow to candidate docs
    cand_idx = candidate_docs_w2v(query, state=state, job_type=job_type, merged_category=merged_category)
    if len(cand_idx) == 0:
        return df.iloc[[]]

    # Expand query and build query embedding (using TF-IDF–weighted Word2Vec)
    q_tokens = expand_query_w2v(query)
    q_vec = tfidf_weighted_doc_vector(q_tokens, wv, dct_w2v, tfidf_w2v)

    # Guard clause: if query embedding is zero, we can’t score meaningfully, return no results
    if np.all(q_vec == 0):
        return df.iloc[[]]

    # Compute Cosine similarity vs candidates (using normalised document embeddings)
    cand_mat = doc_embeddings_norm[cand_idx] # Filter the corpus matrix previously built for only candidates
    q_unit = _l2_normalize_vec(q_vec) # Normalise the query embedding also for equal length comparison

    sims = cand_mat @ q_unit # Can just take dot product since both vectors already normalised (i.e. the denominator of the cosine similarity formula alr done)

    # Get Top-k results by sorting in descending order of sims values
    n = sims.shape[0] # Take no. rows of sims
    if k < n:
        topk_idx = np.argpartition(-sims, k-1)[:k]
        order = topk_idx[np.argsort(-sims[topk_idx])]
    else:
        order = np.argsort(-sims)

    top_local = order
    top_scores = sims[top_local]
    top_doc_idx = cand_idx[top_local] # map back to global row indices

    # Structure output
    out = df.iloc[top_doc_idx].copy()
    out["score"] = top_scores
    out = out.sort_values("score", ascending=False)
    return out

In [34]:
from sentence_transformers import util

def rank_BERT_cosine(query, k=10, state='All', job_type=[], merged_category='All', model=bert_model):
    # Fetch the ids of candidate documents
    c_ids = candidate_docs_sbert(query, state=state, job_type=job_type, merged_category=merged_category)
    c_ids=[str(x) for x in c_ids]

    # Fetch the query embeddings
    q_bert = process_query(query, model=model)        

    
    # perform cosine similarity calculation between query and chunks
    results = collection.query(
        query_embeddings=q_bert,
        ids=c_ids,
        n_results=50 #50 chunks should be sufficient to get results of at least 10 different job postings 
    )        

    job_ids=results['metadatas'][0]
    job_ids=[x['job_id_chunk'] for x in job_ids]
    c_df=pd.DataFrame({
        'job_id_chunk': job_ids,
        'score': results['distances'][0] #chromadb returns a distance score, where lower values close to 0 are better
    })

    # Perform aggregation to score each job posting by its minimum cosine distance with the query
    document_scores = c_df.groupby('job_id_chunk')['score'].min().reset_index()

    # Get top 10
    ranked_documents = document_scores.sort_values(by='score', ascending=True).head(10)

    # fetch original doc
    out = pd.merge(ranked_documents, df, left_on='job_id_chunk', right_on='job_id')
    out = out.sort_values(by='score', ascending=True)
    return out

## Information retrieval

In [35]:
def perform_retrieval(query, k=10, state="All", job_type=[], merged_category='All', embed_model='TF-IDF', model=bert_model):
  if embed_model=='TF-IDF':
    results_df=rank_tfidf_cosine(query, k=k, state=state, job_type=job_type, merged_category=merged_category)

  if embed_model=='Word2Vec':
    results_df=rank_embed_cosine(query, k=k, state=state, job_type=job_type, merged_category=merged_category)

  if embed_model=='SBERT':
    results_df=rank_BERT_cosine(query, k=k, state=state, job_type=job_type, merged_category=merged_category, model=model)

  # Select and format the final output
  final_cols = ['job_title', 'company', 'descriptions', 'State', 'merged_category', 'subcategory', 'type_clean', 'salary']
    
  return results_df[final_cols]

## results styling

In [36]:
from IPython.display import display, HTML

def create_job_post_widget(row):

    # Define a custom layout for the post container
    post_layout = Layout(
        border='1px solid #dcdcdc', 
        padding='10px', 
        margin='10px 0', 
        width='98%',
        background_color='#f9f9f9'
    )

    # 1. Title and Company (Header)
    title_html = f"<h3 style='color:#007BFF; margin-bottom: 5px;'>{row.get('job_title', 'N/A')}</h3>"
    company_html = f"<b>{row.get('company', 'N/A')}</b>"
    
    header_widgets = VBox([
        widgets.HTML(value=title_html),
        widgets.HTML(value=company_html)
    ])

    # 2. Key Details (State, Category, Type, Salary)
    details_html = (
        f"<i class='fa fa-map-marker' style='color: red' aria-hidden='true'></i> <b>State:</b> {row.get('State')} &nbsp; | &nbsp; "
        f"<i class='fa fa-tags' style='color: orange' aria-hidden='true'></i> <b>Category:</b> {row.get('merged_category')} &nbsp; | &nbsp; "
        f"<i class='fa fa-clock-o' style='color: teal' aria-hidden='true'></i> <b>Type:</b> {', '.join(row.get('type_clean'))} &nbsp; | &nbsp; "
        f"<i class='fa fa-money' style='color: green' aria-hidden='true'></i> <b>Salary:</b> <span style='color:green;'>{row.get('salary') if pd.notna(row.get('salary')) else '-'}</span>"
    )
    details_widget = widgets.HTML(value=details_html)

    # 3. Description (expandable)
    full_desc = row.get('descriptions')
    desc_length = 150
    if len(full_desc) > desc_length:
               
        # ** Full Description (Hidden in Accordion)**
        # Use a Textarea to display the full description without scrollbars inside the Accordion
        full_description_widget = widgets.Textarea(
            value=full_desc,
            layout=Layout(width='auto', height='200px'),
            disabled=True # Disable editing
        )
        
        # ** Accordion for Expand/Collapse**
        description_section = widgets.Accordion(children=[full_description_widget])
        description_section.set_title(0, 'Click to Read Job Description...')
        
    else:
        # If the description is short, display it normally
        description_section = widgets.HTML(
            value=f"<p style='margin-top: 8px; font-size: 14px;'>{full_desc}</p>"
        )

    # 4. Add section for popular skills
    skills = skills_map[skills_map['Category'] == row.get('merged_category')]['Final_Skills'].values[0]
    skills_list = ' • '.join(skills) # Use a bullet point separator
    skills_html = (
        f"<div style='border-left: 5px solid #6ac5fe; padding: 5px; margin-top: 10px; background-color: #daf0ff;'>"
        f"  <p style='margin: 0; font-weight: bold; color: #007BFF; font-size: 12px;'>"
        f" <i class='fa fa-pencil' aria-hidden='true'></i> "
        f"    Popular Skills for <b>{row.get('merged_category')}</b> Roles:"
        f"  </p>"
        f"  <p style='margin-top: 2px; margin-bottom: 5px; font-size: 11px; color: #383d41; line-height:1.5'>"
        f"    {skills_list}"
        f"  </p>"
        f"</div>"
    )
    skills_section = widgets.HTML(
            value=f"<p style='margin-top: 8px; font-size: 14px;'>{skills_html}</p>"
        )
    # 5. Assemble the Post
    post_content = VBox([
        header_widgets,
        widgets.HTML(value="<hr style='border-top: 1px solid #ccc; margin: 5px 0;'>"),
        details_widget,
        description_section,
        skills_section
    ], layout=post_layout)

    return post_content

## Event Handling

In [37]:
def on_search_button_clicked(b):
    with output_area:
        clear_output(wait=True)
        print("Searching...")
        
        # Get current widget values
        query = query_input.value
        state = state_filter.value
        job_type = list(type_filter.value)
        merged_category = category_filter.value
        embed_model=model_choice.value

        # Perform retrieval
        try:
            results = perform_retrieval(query, k=10, state=state, job_type=job_type, 
                                        merged_category=merged_category, embed_model=embed_model,
                                        model=bert_model)
            clear_output(wait=True)
            
            if len(results)>0:
                print(f"\n Top {len(results)} job postings:")
                
                post_widgets = []
                for index, row in results.iterrows():
                    # Create widget for each row
                    post_widget = create_job_post_widget(row)
                    post_widgets.append(post_widget)
                
                # Display all posts in a single VBox container
                display(VBox(post_widgets))
            else:
                print(f'\n No postings found' )
              
        except Exception as e:
            clear_output(wait=True)
            print(f"An error occurred during retrieval: {e}")

# Attach the handler to the button
search_button.on_click(on_search_button_clicked)


def on_reset_filters_clicked(b):
    #reset input values
    query_input.value=''
    state_filter.value='All'
    type_filter.value=()
    category_filter.value='All'
        
        

# Attach the handler to the button
reset_button.on_click(on_reset_filters_clicked)

# Widget Interface

In [38]:
# Layout the widgets
filter_options = HBox([
    VBox([state_filter, category_filter], layout=Layout(width='40%')),
    VBox([type_filter], layout=Layout(width='40%')),
    VBox([model_choice], layout=Layout(width='30%'))
])
buttons = HBox([
    VBox([search_button]),
    VBox([reset_button])
])

interface = VBox([
    widgets.Label(value="Search for your next job", style={'font_weight': 'bold'}),
    query_input,
    widgets.Label(value="Filters", style={'font_weight': 'bold'}),
    filter_options,
    buttons,
    output_area
])

display(interface)

VBox(children=(Label(value='Search for your next job', style=LabelStyle(font_weight='bold')), Text(value='', l…

# Evaluation 

In [39]:
# uncomment below to see full values in 'description' column
# pd.set_option('display.max_colwidth', None)

# uncomment below to reset view
# pd.reset_option('all')

In [40]:
def perform_retrieval(query, k=10, state="All", job_type=[], merged_category='All', embed_model='TF-IDF', model=bert_model):
  if embed_model=='TF-IDF':
    results_df=rank_tfidf_cosine(query, k=k, state=state, job_type=job_type, merged_category=merged_category)

  if embed_model=='Word2Vec':
    results_df=rank_embed_cosine(query, k=k, state=state, job_type=job_type, merged_category=merged_category)

  if embed_model=='SBERT':
    results_df=rank_BERT_cosine(query, k=k, state=state, job_type=job_type, merged_category=merged_category, model=model)

  # Select and format the final output
  final_cols = ['job_id','job_title', 'company', 'descriptions', 'State',       'merged_category',   'type_clean']

  if results_df is None:  
    return None
  else: 
    return results_df[final_cols]

In [41]:
k=10
state="All"
job_type=[]
merged_category='All'
embed_model='SBERT'
model=bert_model

In [42]:

q1=perform_retrieval("Retail store supervisor", 
                  state="Kuala Lumpur", 
                  job_type=[], 
                  merged_category="All", 
                  embed_model=embed_model)
q1

Unnamed: 0,job_id,job_title,company,descriptions,State,merged_category,type_clean
0,74946628,RETAIL STORE SUPERVISOR (PAVILION KUALA LUMPUR),Wicked Wave Sdn Bhd,RETAIL STORE SUPERVISOR:\nJob Responsibilities...,Kuala Lumpur,Hospitality & Services,[full time]
1,74799697,"Retail Supervisor (Fashion, Menswear)",AGENSI PEKERJAAN JS STAFFING SERVICES SDN BHD,Key Duties\nMotivate sales staff to ensure the...,Kuala Lumpur,Hospitality & Services,[full time]
2,74733231,Assistant Operations Manager,SK Jewellery Sdn Bhd,RESPONSIBILITIES:\nResponsible for supervising...,Kuala Lumpur,Hospitality & Services,[full time]
3,74911355,Assistant Area Manager,NEW BALANCE ATHLETIC SHOES SDN.BHD.,We are looking for a talented candidate to und...,Kuala Lumpur,Social & Community / Other,[full time]
4,74683198,Jewelry Sales Supervisor (English & Mandarin),Yukimoto Gemstone Sdn Bhd,About Us:\nYukimoto is a premier boutique cele...,Kuala Lumpur,Hospitality & Services,[full time]
5,76502549,Retail Store Manager,MEGAH INOVATIF SDN. BHD.,JOB DESCRIPTION\nOverall care of staff and the...,Kuala Lumpur,Hospitality & Services,[full time]
6,84052063,Supervisor (KLCC & TRX),HUGO BOSS Malaysia Sdn Bhd,Supervisor is accountable for achieving assign...,Kuala Lumpur,Hospitality & Services,[full time]
7,75960258,Assistant Retail Store Manager,FW WOO SDN. BHD.,RESPONSIBILITIES:\nOrganize all store operatio...,Kuala Lumpur,Hospitality & Services,[full time]
8,82583044,Retail Associate / Store Supervisor- Mid Valle...,Evolut Holdings Pte. Ltd.,Stryv is looking for a customer-centric sales ...,Kuala Lumpur,Hospitality & Services,[full time]
9,74663559,Retail Associate,BLAIR & ASSOCIATES SDN. BHD.,Batik Boutique is an award-winning social ente...,Kuala Lumpur,Hospitality & Services,[full time]


In [43]:
q2=perform_retrieval("tuition center teacher", 
                 state="All", 
                 job_type=["Part time"], 
                 merged_category="All", 
                 embed_model=embed_model)
q2

Unnamed: 0,job_id,job_title,company,descriptions,State,merged_category,type_clean
0,75736037,Tuition Teacher,Pusat tuisyen pintar belajar gemilang,"About us\n1.Our Slogan: ""Education with a Spri...",Johor,Education,[part time]
1,75292535,SPM Tutor,AFILLA GROUP SDN BHD,About us\nAfilla Learning Centre offers a rang...,Selangor,Education,[part time]
2,74628286,Teachers,HONGIK EDU SDN.BHD,Hong Ik is an educational institution located ...,Kuala Lumpur,Education,[part time]
3,74511976,School/Learning Center Admin,Private Advertiser,School/Learning Center Sales Admin Customer Se...,Sarawak,Human Resources,[part time]
4,75092029,Teacher (French/ German) (Part-Time) (30019),Sunway Education Group,SUNWAY INTERNATIONAL SCHOOL (SIS)\nis the firs...,Johor,Education,[part time]
5,75774497,International English Teacher(Summer camp 2024),MATRIX EDUCARE SDN BHD,Program duration: 22 July - 9 August 2024\nAge...,Negeri Sembilan,Education,[part time]
6,83077423,Mandarin Teacher 中文老师,Eduwis Sdn. Bhd.,Mandarin Teacher 中文老师 @ Eduwis Education ( Ful...,Selangor,Accounting & Finance,"[contract/temp, full time, part time]"
7,83358579,Unity 3D Teacher,Private Advertiser,Unity Teacher\nThis opportunity is available t...,Sarawak,Education,[part time]
8,76540383,Part-Time Tutors (KS1-IGCSE & A-Level/IBD),COLLINZ TUTORIAL CENTRE SDN. BHD,We are looking for qualified\nPart-Time Tutors...,Selangor,Education,[part time]
9,74763925,Primary Relief Teacher,Taylor's Education Group,We are hiring Relief Teachers for immediate ap...,Kuala Lumpur,Education,[part time]


In [44]:
q3=perform_retrieval("Procurement executive", 
                 state="All", 
                 job_type=[], 
                 merged_category="Healthcare", 
                 embed_model=embed_model)
q3

Unnamed: 0,job_id,job_title,company,descriptions,State,merged_category,type_clean
0,75404448,MATERIAL MANAGEMENT MANAGER (PURCHASING),Melaka Straits Medical Centre Sdn Bhd,We embrace diversity and inclusion and welcome...,Melaka,Healthcare,[full time]
1,75898611,Sourcing and Planning Executive,UC Biosciences,Job Description:\nTo design category sourcing ...,Kuala Lumpur,Healthcare,[full time]
2,75913037,"Executive/Senior Executive, Project Management",NOVUGEN PHARMA (MALAYSIA) SDN. BHD.,About us\nNovugen is a wholly owned subsidiary...,Selangor,Healthcare,[full time]
3,74813633,Sales Executive (KL/Selangor),Alvimedica Malaysia Sdn Bhd,Summary\nThe Sales Executive role is responsib...,Kuala Lumpur,Healthcare,[full time]
4,76105453,PURCHASING EXECUTIVE / ASSISTANT,PENTAVEST HOLDINGS SDN.BHD.,PENTAVEST HOLDINGS SDN BHD\nWe are well establ...,Melaka,Healthcare,[full time]
5,83005797,Executive Provider & Vendor Management,PROTECTHEALTH CORPORATION SDN. BHD.,"What we offer\nAt ProtectHealth Corporation, w...",Selangor,Healthcare,[contract/temp]
6,74822080,"Assistant Manager, Business Operations",DKSH Malaysia Sdn Bhd,Company description:\nAbout DKSH\nDKSH's purpo...,Selangor,Healthcare,[full time]
7,74730737,Security & Transport Executive,Hospital Fatimah,The Security & Transport Executive is responsi...,Perak,Healthcare,[full time]
8,76140562,Logistic Executive,PA Recruitment (KL) Sdn Bhd,Our client is a reputable company in the globa...,Selangor,Healthcare,[full time]
9,76119478,Junior Executive Trainee - Healthcare (October...,DKSH Malaysia Sdn Bhd,Junior Executive Trainee (JET) gives entry lev...,Selangor,Healthcare,[contract/temp]


In [45]:
q4=perform_retrieval("airasia senior engineer", 
                 state="All", 
                 job_type=[], 
                 merged_category="All", 
                 embed_model=embed_model)
q4

Unnamed: 0,job_id,job_title,company,descriptions,State,merged_category,type_clean
0,74542287,Software Development Engineer Testing (SDET),AirAsia,Job Description SDET Responsibilities: Underst...,Kuala Lumpur,Engineering & Technology,[full time]
1,74553434,Senior Software Engineer,AirAsia,Job Description Why AirAsia? Are you ready to ...,Kuala Lumpur,Engineering & Technology,[full time]
2,74033719,Technology Lead,AirAsia,Job Description Why AirAsia? Are you ready to ...,Kuala Lumpur,Engineering & Technology,[full time]
3,73894618,Technical Lead,AirAsia,Job Description Why AirAsia? Are you ready to ...,Kuala Lumpur,Engineering & Technology,[full time]
4,73802993,Lead Data Engineer,AirAsia,Job Description Why AirAsia? Are you ready to ...,Kuala Lumpur,Engineering & Technology,[full time]
5,74300840,Technology Lead,AirAsia,Job Description Why AirAsia? Are you ready to ...,Kuala Lumpur,Engineering & Technology,[full time]
6,73835716,CAMO Technical Services Engineer - Cabin Interior,AirAsia,Job Description AirAsia X is seeking a dynamic...,Kuala Lumpur,Engineering & Technology,[full time]
7,74199570,Site Reliability Engineer,AirAsia,Job Description AirAsia Software Engineering T...,Kuala Lumpur,Engineering & Technology,[full time]
8,73802910,Senior Data Scientist,AirAsia,Job Description Senior Data Scientist Are you ...,Kuala Lumpur,Engineering & Technology,[full time]
9,74641893,Software Engineer II,AirAsia,Job Description AirAsia Software Engineering T...,Kuala Lumpur,Engineering & Technology,[full time]


In [46]:
q5=perform_retrieval("hr exec 2-3 yrs exp", 
                 state="All", 
                 job_type=[], 
                 merged_category="All", 
                 embed_model=embed_model)
q5

Unnamed: 0,job_id,job_title,company,descriptions,State,merged_category,type_clean
0,74505362,HR Admin Assistant [5 days | Junior],Agensi Pekerjaan The Supreme HR Advisory Sdn Bhd,"• Ulu Tiram, Johor\n• 5 days work\n• Career Pr...",Johor,Human Resources,[full time]
1,74342007,Senior HR Admin Executive [1 Year Contract],Agensi Pekerjaan The Supreme HR Advisory Sdn Bhd,• Penang Island - Malaysia\n• 5 days work\n• C...,Penang,Human Resources,[full time]
2,74272346,TENDER ADMIN,HRSB Holdings Sdn Bhd,Description\nKey Responsibilities:\n...\n1. Co...,Melaka,Administration,[full time]
3,74252472,Recruitment Consultant | HR Consultant [Junior...,Agensi Pekerjaan The Supreme HR Advisory Sdn Bhd,"• KL, Malaysia\n• 5 days work\n• Career Progre...",Kuala Lumpur,Human Resources,[full time]
4,74528393,Project Engineer,Agensi Pekerjaan The Supreme HR Advisory Sdn Bhd,Fixed Transport Allowance\nPerformance Bonus\n...,Melaka,Engineering & Technology,[full time]
5,75729444,Procurement Engineer (Mechanical),Jord Malaysia Sdn Bhd,Jord International Pty Ltd\nis an Australian b...,Selangor,Engineering & Technology,[full time]
6,74660622,Full Stack Developer,Agensi Pekerjaan The Supreme HR Advisory Sdn Bhd,13 months salary per year\nYearly Increment & ...,Selangor,Engineering & Technology,[full time]
7,74943706,Production Supervisor,SCIENTEX PACKAGING ( TELUK EMAS ) SDN. BHD.,Tasks & Responsibilities:\nMonitor daily produ...,Melaka,Manufacturing & Logistics,[full time]
8,75745116,Costing Assistant,OTL Asia Sdn Bhd,Position Overview:\nAs a Costing Assistant at ...,Penang,Accounting & Finance,[full time]
9,75085956,Purchasing Assistant,QF Hardware Trading (M) Sdn Bhd,Job Responsibilities:\nGenerate Customer Sales...,Selangor,Administration,[full time]


In [47]:
q6=perform_retrieval("logistics warehouse jobs mandarin speaking", 
                 state="All", 
                 job_type=[], 
                 merged_category="All", 
                 embed_model=embed_model)
q6

Unnamed: 0,job_id,job_title,company,descriptions,State,merged_category,type_clean
0,74661497,Sales Executive,Mandarin Club,Responsibility\nInitiate outbound calls to pot...,Kuala Lumpur,Hospitality & Services,[full time]
1,75325771,Customer Support (Technical Support - Mandarin),Concentrix,Mandarin Speaker - Customer Service Advisor (T...,Kuala Lumpur,Hospitality & Services,[full time]
2,74027375,Procurement and Sourcing Officer,Mandarin Club,Responsible:\nOverseeing and supervising all a...,Selangor,Manufacturing & Logistics,[full time]
3,73239819,SALES & MARKETING EXECUTIVE – LOGISTICS (FREIG...,AP LOGISTICS (M) SDN. BHD.,Job Description\nmeeting with clients virtuall...,Selangor,Sales & Marketing,[full time]
4,76594412,Business Manager,MumsMe Sdn Bhd,MumsMe is currently expanding and looking for ...,Johor,Sales & Marketing,[full time]
5,74038306,ASSISTANT CHIEF ENGINEER,"Mandarin Oriental, Kuala Lumpur",Position:\nASSISTANT CHIEF ENGINEER (Full time...,Kuala Lumpur,Engineering & Technology,[full time]
6,75736026,ADMIN and HR EXECUTIVE,VII STYLE HOLDING SDN. BHD.,About Us\nWe are a Food and Beverages company ...,Kuala Lumpur,Administration,[full time]
7,75111239,Marketing Executive (URGENT HIRING),KEN Holdings Berhad,Job Description\nManage website and enhance we...,Kuala Lumpur,Construction & Trades,[full time]
8,76690384,Site Supervisor (C&S),SHENZHENG DEVELOPMENT & CONSTRUCTION (MALAYSIA...,Job Requirements:\nDiploma/Degree in Civil Eng...,Negeri Sembilan,Construction & Trades,[full time]
9,75965139,Quality Assurance Officer,KARL INTERNATIONAL PTE LTD,Karl International (formerly known as JSA Glob...,Kuala Lumpur,Engineering & Technology,[full time]


In [48]:
q7=perform_retrieval("Communications major marketing roles", 
                 state="All", 
                 job_type=[], 
                 merged_category="All", 
                 embed_model=embed_model)
q7

Unnamed: 0,job_id,job_title,company,descriptions,State,merged_category,type_clean
0,76687499,Marketing Communications Professional APAC,Siemens Energy Sdn. Bhd.,Location\nMalaysia\nSelangor\nPetaling Jaya\nC...,Selangor,Sales & Marketing,[full time]
1,75590941,Senior Marketing Executive,EURATECH INDUSTRIES SDN BHD,Job Description\nAssist Sales & Marketing mana...,Negeri Sembilan,Sales & Marketing,[full time]
2,75080413,Marketing Executive (Digital Marketing),Ace Empire Development Sdn Bhd,Digital Marketing\nDevelop and execute compreh...,Johor,Sales & Marketing,[full time]
3,75573230,Aftermarket Senior Sales Executive,Agensi Pekerjaan KELLY OCG Sdn Bhd,Aftermarket Senior Sales Executive\nOur cultur...,Sarawak,Sales & Marketing,[full time]
4,76084713,Marketing Executive,DR CHONG CLINIC,Responsibilities\n:\n•\tAssist in developing a...,Kuala Lumpur,Sales & Marketing,[full time]
5,82493276,APAC Fresh Project- Global Business Solutions ...,TikTok,Responsibilities\nAbout TikTok\nTikTok is the ...,Kuala Lumpur,Sales & Marketing,[full time]
6,75529177,"Corporate Branding, S&M GM Senior Manager (Pro...",BANGSAR HEIGHTS PAVILION SDN. BHD.,• Develop comprehensive branding strategies th...,Kuala Lumpur,Construction & Trades,[full time]
7,76289798,Sale & Marketing Executive,Agartea Sdn Bhd,"Job Responsibilities\nDevelop, plan, and execu...",Kuala Lumpur,Sales & Marketing,[full time]
8,75021341,Marketing Executive,Sun Ten Pharmaceutical MFG (M) Sdn Bhd,Job Description\nResponsible for new product r...,Kuala Lumpur,Sales & Marketing,[full time]
9,76677567,MARKETING MANAGER,Palmtop Vegeoil Products Sdn. Bhd.,Company Overview:\nWe are a leading producer a...,Kuala Lumpur,Sales & Marketing,[full time]


In [49]:
q8=perform_retrieval("medical device engineer ISO 13485 certification", 
                 state="All", 
                 job_type=[], 
                 merged_category="All", 
                 embed_model=embed_model)
q8

Unnamed: 0,job_id,job_title,company,descriptions,State,merged_category,type_clean
0,74525848,Management System Auditor,CARE CERTIFICATION INTERNATIONAL (M) SDN BHD,Technical personnel requirement\nQualification...,Unknown,Engineering & Technology,[full time]
1,74583853,QA Engineer,Epsilon Medical Devices Sdn Bhd,Description\n1. Develop drawing for new produc...,Perak,Engineering & Technology,[full time]
2,74367076,Engineering Executive / Officer,CCB Medical Devices Sdn Bhd,"Competency Requirements:\nMinimum Diploma, Deg...",Penang,Engineering & Technology,[full time]
3,74324783,Accounts Executive,Andaman Medical Regulatory Affairs & Market Ac...,We Are Looking for Applicants to Join our Pena...,Penang,Accounting & Finance,[full time]
4,74942221,QA ENGINEER (CHEMICAL),HYFLEX TECHNOLOGIES SDN. BHD.,Responsibility\nAssist company to maintain in ...,Johor,Engineering & Technology,[full time]
5,74348743,Field Service Engineer (Technical Support),Boston Scientific Medical Device (Malaysia) Sd...,Purpose Statement:\nAs part of APAC Business S...,Penang,Engineering & Technology,[full time]
6,74469662,Service Engineer (Based in Johor Bahru),Abex Medical System Sdn Bhd,Description\nResponsibilities:\n- Perform prev...,Selangor,Engineering & Technology,[full time]
7,74392103,Equipment Software Engineer (Contract),Boston Scientific Medical Device (Malaysia) Sd...,Key Responsibilities\nResponsible for providin...,Unknown,Engineering & Technology,[contract/temp]
8,73739113,"Data Steward, Japan JARVIS",Boston Scientific Medical Device (Malaysia) Sd...,Additional Locations: N/A\nDiversity\nInnovat...,Kuala Lumpur,Engineering & Technology,[full time]
9,74361111,Senior Product Specialist - Structural Heart,Boston Scientific Medical Device (Malaysia) Sd...,Additional Locations: N/A\nDiversity\nInnovat...,Kuala Lumpur,Sales & Marketing,[full time]


In [50]:
q9=perform_retrieval("data scientist jobs requiring Python and SQL", 
                 state="All", 
                 job_type=[], 
                 merged_category="All", 
                 embed_model=embed_model)
q9

Unnamed: 0,job_id,job_title,company,descriptions,State,merged_category,type_clean
0,76274337,Python Developer,Payments Network Malaysia Sdn Bhd,"Key Areas of Responsibilities:\nDesign, develo...",Kuala Lumpur,Engineering & Technology,[contract/temp]
1,76249766,Data Analyst,TGV CINEMAS SDN BHD,Job Summary\nWe are seeking a Data Analyst to ...,Kuala Lumpur,Engineering & Technology,[full time]
2,74263371,Data Analyst - MRO C&SI,Huawei Technologies (Malaysia) Sdn. Bhd,Responsibilities:\nWork with stakeholders thro...,Kuala Lumpur,Engineering & Technology,[contract/temp]
3,74028112,Data Engineer,DATABRICKS TECH SDN. BHD.,"Job Purpose:\nMaintain, backup, secure, and tr...",Kuala Lumpur,Engineering & Technology,[full time]
4,73635166,Data Scientist,Star Media Group,Job Description\nJob Responsibilities\n1) Data...,Selangor,Engineering & Technology,[full time]
5,75163395,Data Analyst - Digital Marketing,TGV CINEMAS SDN BHD,Role Summary\nWe are seeking a Data Analyst to...,Kuala Lumpur,Creative & Design,[full time]
6,73185688,Sr. Executive Data Scientist,Honda Malaysia Sdn Bhd,Job Summary\nResponsible for the conceptualizi...,Selangor,Engineering & Technology,[contract/temp]
7,83327369,IT Data Analyst,SARAWAK OIL PALMS BERHAD,Qualifications & Requirements:\nPossess at lea...,Sarawak,Engineering & Technology,[full time]
8,73580492,Senior Data Scientist,Carsome Sdn Bhd,About You\nCARSOME is searching for a Senior D...,Selangor,Engineering & Technology,[full time]
9,74599074,Data Scientist,PETROHORIZON SDN. BHD.,TASK & TARGETS\nThe tasks and targets as a dat...,Selangor,Engineering & Technology,[casual/vacation]


In [51]:
q10=perform_retrieval("chief editor", 
                 state="All", 
                 job_type=[], 
                 merged_category="All", 
                 embed_model=embed_model)
q10

Unnamed: 0,job_id,job_title,company,descriptions,State,merged_category,type_clean
0,76123241,Editor (BM/English/Science/Mathematics),Alaf Sanjung Sdn Bhd,· To edit manuscripts thoroughly;\n· To plan a...,Selangor,Creative & Design,[full time]
1,75264066,"Chief Editor, Group Marketing",SEG International Bhd,Responsibilities:\nTo be responsible for conce...,Selangor,Creative & Design,[full time]
2,76084280,SENIOR EXECUTIVE (CHIEF EXECUTIVE OFFICER'S OF...,Perbadanan Bekalan Air Pulau Pinang Sdn Bhd,SENIOR EXECUTIVE (CHIEF EXECUTIVE OFFICER’S OF...,Penang,Administration,[contract/temp]
3,76677389,Book Editor (SC/Maths) - full time/freelancer,Nilam Publication Sdn Bhd,Job Responsibility :\nMust be able to perform ...,Selangor,Creative & Design,[full time]
4,83443779,Team Lead (Editorial) - Based in Singapore,Marshall Cavendish Education Pte Ltd,Role and responsibilities:\nDeliver a competit...,Kuala Lumpur,Education,[full time]
5,73920396,Chief Executive Officer (CEO),Company Confidential,The Chief Executive Officer (CEO) will be resp...,Kuala Lumpur,Management & Strategy,[full time]
6,74478866,Chief Financial Officer (CFO),Company Confidential,The Chief Financial Officer (CFO) will be resp...,Kuala Lumpur,Accounting & Finance,[full time]
7,74765954,Senior HR & Administrative Executive,Royal Institution of Surveyors Malaysia,Job Description:\nAssume management responsibi...,Selangor,Human Resources,[full time]
8,74566561,Chief Financial Officer (CFO),Company Confidential,The Chief Financial Officer (CFO) will be resp...,Kuala Lumpur,Accounting & Finance,[full time]
9,74478956,Chief Financial Officer (CFO),Company Confidential,The Chief Financial Officer (CFO) will be resp...,Kuala Lumpur,Accounting & Finance,[full time]


In [None]:
## uncomment out below to save evaluation results
# dfs_dict = {
#     'q1': q1,
#     'q2': q2,
#     'q3': q3,
#     'q4': q4,
#     'q5': q5,
#     'q6': q6,
#     'q7': q7,
#     'q8': q8,
#     'q9': q9,
#     'q10': q10,
#     }
# final_df=pd.concat(dfs_dict, keys=dfs_dict.keys())
# final_df=final_df.reset_index()
# final_df=final_df.rename(columns={'level_0': 'Query', 'level_1':'rank'})
# final_df.to_csv('SBERT_query_results.csv', index=False)