In [74]:
# Imports
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import re
import numpy as np
import helpers # this is a Joe G. created helper file of functions
from bertopic import BERTopic

In [87]:
# Params / Files to change 
input_file = '../data/cleaned_data/USAJobs.csv' # change to whatever file/filepath you are using
output_file = '../data/results/usa_jobs/topic_match/demo.csv' # change to your outpath
model_file = '../data/results/usa_jobs/topic_match/demo' # change to save model path (** NOTE no file ext!!! **)
desc_column = 'Duties'
start_idx = 0 # file row to start at
end_idx = 100 # file row to end at
#how many jobs do you want to search / score against? make start_idx -1 if you want to use entire file

In [88]:
# model name for BERTopic 
# TODO: re-use saved embedding files?
model_name = 'sentence-transformers/paraphrase-distilroberta-base-v2'
model = SentenceTransformer(model_name)

In [89]:
# file for our cyber baseline
baseline_file = '../data/cleaned_data/KUKSAT_Baseline.csv'
ksa_col = 'KUKSAT'

In [90]:
# read baselne_file and convert to df
df_base = pd.read_csv(baseline_file, encoding='ISO-8859-1')
ksas = df_base[ksa_col].tolist()

In [91]:
# how many ksas
len(df_base)

288

In [92]:
# read input file and covert to df
# covert target description column to list
df = pd.read_csv(input_file)
if start_idx >= 0 and start_idx != end_idx:
    jobs = df[start_idx:end_idx]
else:
    jobs = df.copy()
jobs = jobs[desc_column].tolist()

In [93]:
# how many jobs
len(jobs)

100

In [94]:
# apply text cleanup functions to jobs and ksa base list
jobs = helpers.remove_stopwords(jobs)
ksas = helpers.remove_stopwords(ksas)

jobs = helpers.cleanup_text(jobs)
ksas = helpers.cleanup_text(ksas)

In [95]:
# initalize model, make bigram, fit to job sents
topic_model = BERTopic(embedding_model=model, n_gram_range=(1, 2))

topics, probs = topic_model.fit_transform(jobs)

In [96]:
# view topics
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,49,-1_security_systems_information_policies
1,0,29,0_security_information_it_systems
2,1,12,1_cyber_army_network_security
3,2,10,2_information_security_state_duties


In [25]:
# saves topic model
topic_model.save(model_file)

In [97]:
# Try using each KSA and compare see the top 10 topics that fit
# If matches (x >= 0.6), similar matches ( 0.6< x >0.4)
all_matches = []
all_similar =[]
for idx, ksa in enumerate(ksas):
    matched=[]
    similar=[]
    topics, similarity = topic_model.find_topics(ksa,top_n=10)

    for idx2, score in enumerate(similarity):
        temp={}
        if score >= 0.6:
            temp['ksa_idx'] = idx
            temp['ksa_text'] = ksas[idx]
            temp['topic_idx'] = idx2
            temp['sim_score'] = score
            temp['topic_info'] = topic_model.get_topic(idx2)
            matched.append(temp)
        if score <0.6 and score > 0.4:
            temp['ksa_idx'] = idx
            temp['ksa_text'] = ksas[idx]
            temp['topic_idx'] = idx2
            temp['sim_score'] = score
            temp['topic_info'] = topic_model.get_topic(idx2)
            similar.append(temp)
    all_matches.append(matched)
    all_similar.append(similar)

In [98]:
# calculates aggregate scores for matching, similar, missing
# uses total topics
ksa_agg_matches = []
ksa_agg_similar = []
ksa_agg_missing = []
num_topics = len(topic_model.get_topic_info()) -1
for idx, val in enumerate(ksas):
    matched_score = len(all_matches[idx])/num_topics
    similar_score = len(all_similar[idx])/num_topics
    missing_score = 1 -(matched_score + similar_score)
    ksa_agg_matches.append(matched_score)
    ksa_agg_similar.append(similar_score)
    ksa_agg_missing.append(missing_score)

In [99]:
# dataframes all results
final_df = pd.DataFrame({
    'ksa': ksas,
    'matches': all_matches,
    'similar': all_similar,
    'matched_score': ksa_agg_matches,
    'similar_score': ksa_agg_similar,
    'missing_score': ksa_agg_missing
})

In [None]:
# save to output_file
final_df.to_csv(output_file, index=False)

In [100]:
# For local visual / spot check sorts by matched, similar, missing score
final_df.sort_values(by=['matched_score', 'similar_score', 'missing_score'], ascending=False)

Unnamed: 0,ksa,matches,similar,matched_score,similar_score,missing_score
4,Security Life Cycle,[],"[{'ksa_idx': 4, 'ksa_text': 'Security Life Cyc...",0.0,1.333333,-0.333333
9,"Confidentiality, Integrity, Availability, Acce...",[],"[{'ksa_idx': 9, 'ksa_text': 'Confidentiality, ...",0.0,1.333333,-0.333333
12,"Security Mechanisms e.g., Identification Authe...",[],"[{'ksa_idx': 12, 'ksa_text': 'Security Mechani...",0.0,1.333333,-0.333333
80,Fundamental security design principles applied OS,[],"[{'ksa_idx': 80, 'ksa_text': 'Fundamental secu...",0.0,1.333333,-0.333333
108,Security program addresses a. Physical Securit...,[],"[{'ksa_idx': 108, 'ksa_text': 'Security progra...",0.0,1.333333,-0.333333
...,...,...,...,...,...,...
279,Assess adequate access controls based principl...,[],[],0.0,0.000000,1.000000
282,Coordinate incident response functions.,[],[],0.0,0.000000,1.000000
284,Work stakeholders resolve computer security in...,[],[],0.0,0.000000,1.000000
285,"Provide advice input Disaster Recovery, Contin...",[],[],0.0,0.000000,1.000000


In [101]:
len(final_df[final_df['missing_score'] !=1])

112

In [102]:
len(final_df[final_df['missing_score'] !=1]) / len(ksas)

0.3888888888888889

In [None]:
# USA Jobs
# started 8:25pm
# ended at 8:46pm

In [None]:
# NLX 
# started 9:08 pm

In [52]:
# topic_model=BERTopic.load('path_to_model', embedding_model=model)

In [60]:
load_model = 'C:/Users/Joe/Desktop/usa_jobs_topic/all'

In [105]:
load_results  = 'C:/Users/Joe/Desktop/usa_jobs_topic/all.csv'

In [115]:
topic_model=BERTopic.load(load_model, embedding_model=model)

In [116]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,3848,-1_security_information_systems_it
1,0,592,0_you_you develop_you provide_requirements you
2,1,318,1_ia_assurance ia_information assurance_assurance
3,2,265,2_cyberspace_cyberspace operations_operations_...
4,3,105,3_issm_manager issm_security manager_manager
...,...,...,...
267,272,11,272_outputs tested_user requirements_proposals...
265,269,11,269_brigade cyber_cyber security_brigade_7th
275,274,10,274_supervises civilian_difficulty_selective_a...
276,275,10,275_ustranscom_ustranscom cyberspace_administe...


In [117]:
topic_model.get_topic(3)

[('issm', 0.021889490721834857),
 ('manager issm', 0.01900340398098318),
 ('security manager', 0.016641567454815237),
 ('manager', 0.012416893399358326),
 ('serves senior', 0.01225885496949414),
 ('serves', 0.010896975062448345),
 ('information system', 0.010375797186894166),
 ('system security', 0.009720307751888473),
 ('senior', 0.009222221322456044),
 ('cybersecurity', 0.007479209511776404)]

In [106]:
df = pd.read_csv(load_results)

In [107]:
df.sort_values(by=['matched_score', 'similar_score', 'missing_score'], ascending=False)

Unnamed: 0,ksa,matches,similar,matched_score,similar_score,missing_score
145,Knowledge database systems.,"[{'ksa_idx': 145, 'ksa_text': 'Knowledge datab...","[{'ksa_idx': 145, 'ksa_text': 'Knowledge datab...",0.01444,0.021661,0.963899
36,"Networks Internet, LANs, wireless","[{'ksa_idx': 36, 'ksa_text': 'Networks Interne...","[{'ksa_idx': 36, 'ksa_text': 'Networks Interne...",0.00722,0.028881,0.963899
9,"Confidentiality, Integrity, Availability, Acce...","[{'ksa_idx': 9, 'ksa_text': 'Confidentiality, ...","[{'ksa_idx': 9, 'ksa_text': 'Confidentiality, ...",0.00361,0.032491,0.963899
80,Fundamental security design principles applied OS,"[{'ksa_idx': 80, 'ksa_text': 'Fundamental secu...","[{'ksa_idx': 80, 'ksa_text': 'Fundamental secu...",0.00361,0.032491,0.963899
134,"Knowledge laws, regulations, policies, ethics ...","[{'ksa_idx': 134, 'ksa_text': 'Knowledge laws,...","[{'ksa_idx': 134, 'ksa_text': 'Knowledge laws,...",0.00361,0.032491,0.963899
...,...,...,...,...,...,...
204,Knowledge organization's information classific...,[],[],0.00000,0.000000,1.000000
208,Knowledge packet level analysis using appropri...,[],[],0.00000,0.000000,1.000000
230,"Skill use social engineering techniques. e.g.,...",[],[],0.00000,0.000000,1.000000
263,Receive analyze network alerts various sources...,[],[],0.00000,0.000000,1.000000


In [108]:
len(df[df['missing_score'] != 1.0])/len(ksas)

0.8715277777777778

In [109]:
df['matched_score'].max()

0.0144404332129963

In [112]:
df['matches'][145]

"[{'ksa_idx': 145, 'ksa_text': 'Knowledge database systems.', 'topic_idx': 0, 'sim_score': 0.7671856711175069, 'topic_info': [('you', 0.023201699648655837), ('you develop', 0.008660455047162487), ('you provide', 0.006548816790415824), ('requirements you', 0.005896191885981673), ('systems you', 0.005375207390411087), ('develop', 0.0048641500661284354), ('you ensure', 0.004682562485657709), ('you perform', 0.004653650328538652), ('you manage', 0.0045941412848700845), ('maintain', 0.004458890229532921)]}, {'ksa_idx': 145, 'ksa_text': 'Knowledge database systems.', 'topic_idx': 1, 'sim_score': 0.7252624250274435, 'topic_info': [('ia', 0.024086324428860956), ('assurance ia', 0.012687899087238386), ('information assurance', 0.00951916784908107), ('assurance', 0.008923536857248733), ('you', 0.007114781202154469), ('ia program', 0.006583393681839103), ('information', 0.0054288018972213385), ('ia security', 0.0049259310158541085), ('ia programs', 0.004069027390160126), ('technology it', 0.00400

In [118]:
topic_model.visualize_topics()

In [122]:
topic_model.visualize_heatmap()

In [119]:
topic_model.visualize_barchart()