In [None]:
# Imports
import re
from pathlib import Path

import numpy as np
import pandas as pd
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

import helpers # this is a Joe G. created helper file of functions

In [None]:
# Parameters: edit these paths and settings before running the notebook.
input_file = '../data/cleaned_data/USAJobs.csv' # CSV of job postings to analyse
output_file = '../data/results/usa_jobs/topic_match/demo.csv' # where the per-KSA results CSV is written
model_file = '../data/results/usa_jobs/topic_match/demo' # where the fitted topic model is saved (NOTE: no file extension)
desc_column = 'Duties' # column of input_file that holds the job description text
# Row window of input_file to process: rows start_idx (inclusive) up to end_idx (exclusive).
# The whole file is used instead when start_idx is negative (e.g. -1) or start_idx == end_idx.
start_idx = 0 # first file row to include
end_idx = 100 # file row to stop before

In [None]:
# Sentence-transformer model that is handed to BERTopic as its embedding model.
# TODO: re-use saved embedding files?
model_name = 'sentence-transformers/paraphrase-distilroberta-base-v2'
model = SentenceTransformer(model_name)

In [None]:
# Baseline file for the cyber KSA list, and the column in it that is read as the KSA text.
baseline_file = '../data/cleaned_data/KUKSAT_Baseline.csv'
ksa_col = 'KUKSAT'

In [None]:
# Read baseline_file into a DataFrame and pull the KSA column out as a plain list.
# The file is decoded as ISO-8859-1 rather than with pandas' default encoding.
df_base = pd.read_csv(baseline_file, encoding='ISO-8859-1')
ksas = df_base[ksa_col].tolist()

In [None]:
# Number of KSA rows in the baseline
df_base.shape[0]

In [None]:
# Load the input file and extract the description column as a list.
# A non-negative start_idx that differs from end_idx selects rows
# [start_idx, end_idx); any other setting keeps every row.
df = pd.read_csv(input_file)
use_row_window = start_idx >= 0 and start_idx != end_idx
selected_rows = df.iloc[start_idx:end_idx] if use_row_window else df
jobs = selected_rows[desc_column].tolist()

In [None]:
# Number of job descriptions selected for topic modelling
len(jobs)

In [None]:
# Apply the helper text-cleanup functions to both the job descriptions and the
# KSA baseline list: remove_stopwords first, then cleanup_text.
# (What each helper does exactly is defined in helpers.py, not shown here.)
# NOTE(review): jobs/ksas are reassigned from themselves, so re-running this
# cell applies the cleanup a second time. Re-run the loading cells first.
jobs = helpers.remove_stopwords(jobs)
ksas = helpers.remove_stopwords(ksas)

jobs = helpers.cleanup_text(jobs)
ksas = helpers.cleanup_text(ksas)

In [None]:
# Initialise BERTopic with the sentence-transformer embedding model and
# n_gram_range=(1, 2) (single words and bigrams), then fit it to the job descriptions.
# fit_transform returns the topic assigned to each document plus the probabilities.
# NOTE(review): no random seed is set here, so topics may differ between runs.
# Confirm whether reproducible topics are required.
topic_model = BERTopic(embedding_model=model, n_gram_range=(1, 2))

topics, probs = topic_model.fit_transform(jobs)

In [None]:
# Display the summary table of the fitted topics
topic_model.get_topic_info()

In [None]:
# Save the fitted topic model to model_file.
# Create the destination folder first so a missing directory cannot make the
# save fail after the model has already been fitted.
Path(model_file).parent.mkdir(parents=True, exist_ok=True)
topic_model.save(model_file)

In [None]:
# For every KSA, ask the topic model for its 10 most similar topics and bucket
# them by similarity score:
#   matched: score >= 0.6
#   similar: 0.4 < score < 0.6
# Anything at or below 0.4 is ignored.
#
# Results are parallel to ksas: all_matches[i] / all_similar[i] hold the list
# of record dicts for ksas[i].
MATCH_THRESHOLD = 0.6
SIMILAR_THRESHOLD = 0.4
TOP_N_TOPICS = 10

all_matches = []
all_similar = []
for ksa_idx, ksa in enumerate(ksas):
    matched = []
    similar = []
    # find_topics returns two parallel lists: the topic ids and their
    # similarity scores. The topic id must come from the first list; the
    # position of a score in the second list is only its rank, not a topic id.
    # A separate name is used so the per-document `topics` from fit_transform
    # is not overwritten.
    candidate_topics, similarity = topic_model.find_topics(ksa, top_n=TOP_N_TOPICS)

    for topic_id, score in zip(candidate_topics, similarity):
        if score >= MATCH_THRESHOLD:
            bucket = matched
        elif score > SIMILAR_THRESHOLD:
            bucket = similar
        else:
            continue  # too dissimilar to count as matched or similar
        bucket.append({
            'ksa_idx': ksa_idx,
            'ksa_text': ksa,
            'topic_idx': topic_id,
            'sim_score': score,
            'topic_info': topic_model.get_topic(topic_id),
        })
    all_matches.append(matched)
    all_similar.append(similar)

In [None]:
# Aggregate per-KSA scores: the fraction of all real topics that were matched,
# the fraction that were similar, and the remainder ("missing").
topic_info = topic_model.get_topic_info()
# Count only real topics. BERTopic uses topic id -1 for outliers and that row
# is only present when outliers exist, so subtracting a fixed 1 from the row
# count can under-count the topics.
num_topics = int((topic_info['Topic'] != -1).sum())

ksa_agg_matches = []
ksa_agg_similar = []
ksa_agg_missing = []
for matched, similar in zip(all_matches, all_similar):
    if num_topics > 0:
        matched_score = len(matched) / num_topics
        similar_score = len(similar) / num_topics
    else:
        # No real topics were found: nothing can match, and this avoids a
        # ZeroDivisionError.
        matched_score = 0.0
        similar_score = 0.0
    missing_score = 1 - (matched_score + similar_score)
    ksa_agg_matches.append(matched_score)
    ksa_agg_similar.append(similar_score)
    ksa_agg_missing.append(missing_score)

In [None]:
# Gather every per-KSA result into a single DataFrame (one row per KSA)
result_columns = {
    'ksa': ksas,
    'matches': all_matches,
    'similar': all_similar,
    'matched_score': ksa_agg_matches,
    'similar_score': ksa_agg_similar,
    'missing_score': ksa_agg_missing,
}
final_df = pd.DataFrame(result_columns)

In [None]:
# Write the results to output_file (without the DataFrame index column).
# Create the destination folder first so a missing directory cannot make the
# write fail and lose the computed results.
Path(output_file).parent.mkdir(parents=True, exist_ok=True)
final_df.to_csv(output_file, index=False)

In [None]:
# Local spot check: rank KSAs by matched, then similar, then missing score (highest first)
rank_columns = ['matched_score', 'similar_score', 'missing_score']
final_df.sort_values(by=rank_columns, ascending=False)

In [None]:
# Number of KSAs whose missing_score is not 1, i.e. with at least one matched or similar topic
int((final_df['missing_score'] != 1).sum())

In [None]:
# Share of all KSAs whose missing_score is not 1
int((final_df['missing_score'] != 1).sum()) / len(ksas)

In [None]:
# Run-time log: USA Jobs
# started 8:25 pm
# ended 8:46 pm (about 21 minutes)

In [None]:
# Run-time log: NLX
# started 9:08 pm (end time not recorded)

In [None]:
# Template for reloading a saved model (replace 'path_to_model' with the saved model path):
# topic_model=BERTopic.load('path_to_model', embedding_model=model)

In [None]:
# Path of a previously saved topic model to reload.
# NOTE(review): absolute path on one user's machine. Change it before running elsewhere.
load_model = 'C:/Users/Joe/Desktop/usa_jobs_topic/all'

In [None]:
# Path of a previously saved results CSV to reload.
# NOTE(review): absolute path on one user's machine. Change it before running elsewhere.
load_results  = 'C:/Users/Joe/Desktop/usa_jobs_topic/all.csv'

In [None]:
# Reload the saved topic model from load_model, passing in the embedding model created above.
# NOTE(review): this replaces the topic_model fitted earlier in the notebook.
# NOTE(review): the saved model is presumably pickle-based, so only load files you trust. Confirm the format.
topic_model=BERTopic.load(load_model, embedding_model=model)

In [None]:
# Display the summary table of topics in the reloaded model
topic_model.get_topic_info()

In [None]:
# Spot check: show the contents of topic 3 (the id is just an example)
topic_model.get_topic(3)

In [None]:
# Load the previously saved results CSV.
# NOTE(review): this reuses the name df, replacing the input-file DataFrame loaded earlier.
df = pd.read_csv(load_results)

In [None]:
# Rank the loaded results by matched, then similar, then missing score (highest first)
rank_columns = ['matched_score', 'similar_score', 'missing_score']
df.sort_values(by=rank_columns, ascending=False)

In [None]:
# Share of KSAs in the loaded results whose missing_score is not 1.
# The denominator is the loaded table's own row count, so the ratio stays
# correct even if the in-memory ksas list came from a different baseline or run.
len(df[df['missing_score'] != 1.0]) / len(df)

In [None]:
# Highest matched_score in the loaded results
df['matched_score'].max()

In [None]:
# Inspect the matches recorded for the result row labelled 145
df.loc[145, 'matches']

In [None]:
# BERTopic built-in visualisation of the topics
topic_model.visualize_topics()

In [None]:
# BERTopic built-in heatmap visualisation
topic_model.visualize_heatmap()

In [None]:
# BERTopic built-in bar-chart visualisation
topic_model.visualize_barchart()