In [1]:
# Imports
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import re
import numpy as np
import helpers # this is a Joe G. created helper file of functions
from bertopic import BERTopic

In [3]:
# Params / Files to change
input_file = '../data/cleaned_data/USAJobs.csv'  # change to whatever file/filepath you are using
output_file = '../data/results/usa_jobs/topic_match/batch1.csv'  # change to your outpath
model_file = '../data/results/usa_jobs/topic_match/batch_model1'  # change to save model path (** NOTE no file ext!!! **)
desc_column = 'Duties'  # column of input_file that holds the job description text
start_idx = 0  # file row to start at (set to -1 to use the entire file)
end_idx = 1000  # file row to end at (not included in the slice)
# How many jobs do you want to search / score against?
# Rows start_idx up to (but not including) end_idx are used; a negative start_idx,
# or start_idx equal to end_idx, means the entire file is used.

In [4]:
# Sentence-embedding model that is later handed to BERTopic as its embedding model.
# model_name is the identifier passed to SentenceTransformer.
# TODO: re-use saved embedding files?
model_name = 'sentence-transformers/paraphrase-distilroberta-base-v2'
model = SentenceTransformer(model_name)

In [5]:
# Cyber baseline: path of the CSV file and the name of the column holding the KSA text
baseline_file, ksa_col = '../data/cleaned_data/KUKSAT_Baseline.csv', 'KUKSAT'

In [6]:
# Load the baseline CSV (read as ISO-8859-1) into a DataFrame,
# then pull the KSA column out as a plain Python list
df_base = pd.read_csv(baseline_file, encoding='ISO-8859-1')
ksas = df_base.loc[:, ksa_col].tolist()

In [7]:
# Number of rows in the baseline DataFrame, i.e. how many KSAs will be compared
len(df_base)

288

In [8]:
# Read the input file into a DataFrame, then convert the target description column to a list.
# A negative start_idx, or start_idx equal to end_idx, selects the whole file;
# otherwise only rows start_idx up to (not including) end_idx are kept.
df = pd.read_csv(input_file)
jobs = df.copy() if (start_idx < 0 or start_idx == end_idx) else df[start_idx:end_idx]
jobs = jobs[desc_column].tolist()

In [9]:
# Number of job descriptions selected from the input file
len(jobs)

1000

In [10]:
# Run both text-cleanup helpers over the job list and the KSA baseline list:
# first stop-word removal, then general text cleanup (jobs first, KSAs second each time).
# NOTE(review): this cell overwrites jobs/ksas, so re-running it applies the cleanup again —
# presumably harmless, but confirm the helpers are idempotent.
jobs, ksas = helpers.remove_stopwords(jobs), helpers.remove_stopwords(ksas)
jobs, ksas = helpers.cleanup_text(jobs), helpers.cleanup_text(ksas)

In [27]:
# Initialize BERTopic with the sentence-transformer embedding model and an n-gram range
# of (1, 2) (unigrams and bigrams), then fit it on the cleaned job texts.
# NOTE(review): no random seed is set in the visible cells, so the topics may differ
# between runs — confirm whether reproducibility matters here.
topic_model = BERTopic(embedding_model=model, n_gram_range=(1, 2))

# topics / probs are the two values returned by fit_transform for the job texts
topics, probs = topic_model.fit_transform(jobs)

In [28]:
# View the topic summary table (columns: Topic, Count, Name)
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,320,-1_security_information_systems_it
1,0,91,0_information_security_systems_infosec
2,1,70,1_information_security_program_comsec
3,2,56,2_cyberspace_operations_training_mission
4,3,55,3_ia_information_security_systems
5,4,41,4_information_security_systems_information sys...
6,5,34,5_security_agency_data_systems
7,6,33,6_network_software_systems_hardware
8,7,31,7_cyber_incident_forensic_evidence
9,8,28,8_cyber_army_operations_dod


In [29]:
# Save the fitted topic model to model_file (that path is configured without a file extension)
topic_model.save(model_file)

In [30]:
# Compare every KSA against the fitted topics, looking at its 10 most similar topics.
# A topic counts as a "match" when similarity >= 0.6 and as "similar" when
# 0.4 < similarity < 0.6; scores of 0.4 or below are ignored.
MATCH_THRESHOLD = 0.6
SIMILAR_THRESHOLD = 0.4

all_matches = []  # one list of match records per KSA, in ksas order
all_similar = []  # one list of similar records per KSA, in ksas order
for idx, ksa in enumerate(ksas):
    matched = []
    similar = []
    # find_topics returns two parallel lists: topic ids and their similarity scores.
    # A separate name is used here so the `topics` list produced by fit_transform
    # is not overwritten by this loop.
    ksa_topic_ids, similarity = topic_model.find_topics(ksa, top_n=10)

    for topic_id, score in zip(ksa_topic_ids, similarity):
        if score >= MATCH_THRESHOLD:
            bucket = matched
        elif score > SIMILAR_THRESHOLD:
            bucket = similar
        else:
            continue
        # Record the actual topic id returned by find_topics. The position of the
        # score in the result list is only its rank, not a topic id, so it must not
        # be used for 'topic_idx' or for looking up the topic's terms.
        # NOTE(review): find_topics may also return the outlier topic (-1); it is kept
        # here so the per-KSA counts stay as before — confirm whether it should be skipped.
        bucket.append({
            'ksa_idx': idx,
            'ksa_text': ksa,
            'topic_idx': topic_id,
            'sim_score': score,
            'topic_info': topic_model.get_topic(topic_id),
        })
    all_matches.append(matched)
    all_similar.append(similar)

In [31]:
# Calculate aggregate scores per KSA: the fraction of topics that matched, were similar,
# or were missing (neither). The denominator is the number of real topics, i.e. the
# rows of the topic table whose Topic id is not the outlier id -1.
# NOTE(review): the matching cell only inspects the top 10 topics per KSA (top_n=10),
# so matched_score + similar_score cannot exceed 10 / num_topics.
ksa_agg_matches = []
ksa_agg_similar = []
ksa_agg_missing = []
topic_info = topic_model.get_topic_info()
# Count non-outlier rows explicitly instead of subtracting 1, which would undercount
# when the model produced no outlier (-1) row.
num_topics = int((topic_info['Topic'] != -1).sum())
if num_topics == 0:
    raise ValueError('topic model has no topics besides outliers; cannot compute aggregate scores')
for idx in range(len(ksas)):
    matched_score = len(all_matches[idx]) / num_topics
    similar_score = len(all_similar[idx]) / num_topics
    missing_score = 1 - (matched_score + similar_score)
    ksa_agg_matches.append(matched_score)
    ksa_agg_similar.append(similar_score)
    ksa_agg_missing.append(missing_score)

In [32]:
# Collect every per-KSA result into one DataFrame: the KSA text, its match and
# similar records, and the three aggregate scores (one row per KSA)
final_df = pd.DataFrame(dict(
    ksa=ksas,
    matches=all_matches,
    similar=all_similar,
    matched_score=ksa_agg_matches,
    similar_score=ksa_agg_similar,
    missing_score=ksa_agg_missing,
))

In [33]:
# Write the results table to output_file as CSV, without the DataFrame index column
final_df.to_csv(output_file, index=False)

In [34]:
# For a local visual spot check: sort descending by matched score,
# then similar score, then missing score (does not modify final_df)
final_df.sort_values(by=['matched_score', 'similar_score', 'missing_score'], ascending=False)

Unnamed: 0,ksa,matches,similar,matched_score,similar_score,missing_score
12,"Security Mechanisms e.g., Identification Authe...","[{'ksa_idx': 12, 'ksa_text': 'Security Mechani...","[{'ksa_idx': 12, 'ksa_text': 'Security Mechani...",0.043478,0.391304,0.565217
80,Fundamental security design principles applied OS,"[{'ksa_idx': 80, 'ksa_text': 'Fundamental secu...","[{'ksa_idx': 80, 'ksa_text': 'Fundamental secu...",0.043478,0.391304,0.565217
222,WITHDRAWN Skill assessing robustness security ...,"[{'ksa_idx': 222, 'ksa_text': 'WITHDRAWN Skill...","[{'ksa_idx': 222, 'ksa_text': 'WITHDRAWN Skill...",0.043478,0.391304,0.565217
226,Skill evaluating adequacy security designs.,"[{'ksa_idx': 226, 'ksa_text': 'Skill evaluatin...","[{'ksa_idx': 226, 'ksa_text': 'Skill evaluatin...",0.043478,0.391304,0.565217
278,Analyze report system security posture trends.,"[{'ksa_idx': 278, 'ksa_text': 'Analyze report ...","[{'ksa_idx': 278, 'ksa_text': 'Analyze report ...",0.043478,0.391304,0.565217
...,...,...,...,...,...,...
272,Identify network mapping operating system OS f...,[],[],0.000000,0.000000,1.000000
274,Coordinate intelligence analysts correlate thr...,[],[],0.000000,0.000000,1.000000
276,Write publish action reviews.,[],[],0.000000,0.000000,1.000000
279,Assess adequate access controls based principl...,[],[],0.000000,0.000000,1.000000


In [35]:
# Number of KSAs whose missing_score is not 1, i.e. KSAs with at least one
# matching or similar topic
int(final_df['missing_score'].ne(1).sum())

176

In [36]:
# To reload a previously saved topic model instead of refitting, uncomment and adjust the path:
# topic_model=BERTopic.load('path_to_model', embedding_model=model)