In [20]:
# Imports
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import re
import numpy as np
import helpers # this is a Joe G. created helper file of functions
from bertopic import BERTopic

In [37]:
# Run configuration -- edit these values for your own data and output locations.
input_file = "../data/cleaned_data/USAJobs.csv"  # input CSV; change to whatever file/filepath you are using
output_file = "../data/results/usa_jobs/similarity_match/batch2.csv"  # change to your output path
desc_column = "Duties"  # column of input_file that is converted to the list of job texts
baseline_embedding_file = "../data/saved_embeddings/baseline.pickle"  # make None if you don't want to save
sent_embedding_file = "../data/saved_embeddings/test_batch.pickle"  # change to your sentence-embedding path
# Row window of input_file to score against (used as the slice start_idx:end_idx).
# Set start_idx to -1 to use the entire file.
start_idx = 0  # file row to start at
end_idx = 100  # file row to end at (excluded by the slice)

In [38]:
# Load the pre-trained sentence-embedding model.
# The model is downloaded automatically from Hugging Face.
model_name = "sentence-transformers/paraphrase-distilroberta-base-v2"
model = SentenceTransformer(model_name)

In [39]:
# Cyber baseline: the CSV file and the column that holds the KSA text.
# TODO: hopefully an embedding file can be used here later.
# TODO(open question): is Knowledge_Units.csv the final baseline file?
baseline_file = "../data/cleaned_data/KUKSAT_Baseline.csv"
ksa_col = "KUKSAT"

In [40]:
# Load the baseline CSV into a DataFrame, then pull the KSA column out as a list.
df_base = pd.read_csv(baseline_file, encoding="ISO-8859-1")
ksas = df_base.loc[:, ksa_col].tolist()

In [41]:
# Number of KSAs, i.e. rows in the baseline DataFrame
df_base.shape[0]

288

In [42]:
# Read the input file into a DataFrame and convert the target description
# column to a list. The start_idx:end_idx row window is applied only when
# start_idx is non-negative and differs from end_idx; otherwise every row is used.
df = pd.read_csv(input_file)
jobs = df[start_idx:end_idx] if (start_idx >= 0 and start_idx != end_idx) else df.copy()
jobs = jobs[desc_column].tolist()

In [43]:
# Number of job descriptions selected for this run
len(jobs)

100

In [44]:
# Run the shared text-cleanup helper over the job descriptions and the KSA baseline list.
# Stop-word removal (helpers.remove_stopwords on jobs / ksas) is currently disabled.
jobs, ksas = helpers.cleanup_text(jobs), helpers.cleanup_text(ksas)

In [45]:
# Split each job into sentences, then flatten the per-job results into one list.
job_sent = [sentence for sentences in helpers.split_sents(jobs) for sentence in sentences]

In [46]:
# Fit a BERTopic model on the job sentences, using the sentence-transformer
# loaded above as the embedding model.
# NOTE(review): the stop-word removal calls in the cleanup cell are commented
# out, so the sentences presumably still contain stop words -- confirm.
# NOTE(review): no random seed is set here; presumably the resulting topics can
# differ between runs -- confirm.
topic_model = BERTopic(embedding_model=model)

topics, probs = topic_model.fit_transform(job_sent)

In [83]:
# Number of rows in the topic-info table (one row per topic)
len(topic_model.get_topic_info())

22

In [48]:
# Persist the fitted topic model to disk so it can be reloaded later.
topic_model.save(
    '../data/results/usa_jobs/similarity_match/topicModel'
)

In [72]:
# Score every KSA against the fitted topics and bucket the hits by similarity.
#
# Produces two lists that are index-aligned with `ksas`:
#   all_matches[i] -- records whose score is >= MATCH_THRESHOLD
#   all_similar[i] -- records whose score is strictly between the two thresholds
# Each record is a dict with keys ksa_idx, ksa_text, topic_idx, sim_score, topic_info.
MATCH_THRESHOLD = 0.6
SIMILAR_THRESHOLD = 0.4

all_matches = []
all_similar = []
for ksa_idx, ksa_text in enumerate(ksas):
    matched = []
    similar = []
    # find_topics returns two parallel sequences: the topic ids and their
    # similarity scores. The topic id must come from the first sequence; the
    # position in the score list is only a rank, not a topic id.
    # Distinct names are used so the `topics` returned by fit_transform is
    # not overwritten.
    found_topics, found_scores = topic_model.find_topics(ksa_text, top_n=10)

    for topic_id, score in zip(found_topics, found_scores):
        if score >= MATCH_THRESHOLD:
            bucket = matched
        elif score > SIMILAR_THRESHOLD:
            bucket = similar
        else:
            # at or below the lower threshold (or not comparable): ignore
            continue
        bucket.append({
            'ksa_idx': ksa_idx,
            'ksa_text': ksa_text,
            'topic_idx': topic_id,
            'sim_score': score,
            'topic_info': topic_model.get_topic(topic_id),
        })
    all_matches.append(matched)
    all_similar.append(similar)

In [None]:
# Number of topics in the fitted model, counted as rows of the topic-info
# table (the same count the notebook displays earlier).
# NOTE(review): this count includes the -1 row when the table has one -- confirm
# whether that row should be excluded from the denominator.
num_topics = len(topic_model.get_topic_info())

In [None]:
# Per-KSA aggregate scores: the share of topics that landed in the "matched"
# and "similar" buckets. Both lists are index-aligned with `ksas`.
# NOTE(review): the original cell stopped mid-expression after the division
# sign; num_topics is assumed to be the intended denominator -- confirm.
ksa_agg_matches = []
ksa_agg_similar = []
for idx in range(len(ksas)):
    matched_score = len(all_matches[idx]) / num_topics
    similar_score = len(all_similar[idx]) / num_topics
    ksa_agg_matches.append(matched_score)
    ksa_agg_similar.append(similar_score)

In [73]:
# Assemble the results table: one row per KSA with its matched and similar topic records.
final_df = pd.DataFrame(dict(ksa=ksas, matches=all_matches, similar=all_similar))

[[],
 [{'ksa_idx': 1,
   'ksa_text': 'Vulnerabilities and Risk management include backups and recovery',
   'topic_idx': 0,
   'sim_score': 0.46687835039694586,
   'topic_info': [('vulnerabilities', 0.08127912833734274),
    ('identify', 0.07051248532026982),
    ('risks', 0.06976112365770233),
    ('vulnerability', 0.05850702039482417),
    ('risk', 0.04521581582197221),
    ('and', 0.044031738573732396),
    ('assessments', 0.040991157701148834),
    ('systems', 0.039176227310131484),
    ('to', 0.0389712415736714),
    ('protection', 0.0370126287267103)]}],
 [{'ksa_idx': 2,
   'ksa_text': 'Common Attacks',
   'topic_idx': 0,
   'sim_score': 0.4431535197523256,
   'topic_info': [('vulnerabilities', 0.08127912833734274),
    ('identify', 0.07051248532026982),
    ('risks', 0.06976112365770233),
    ('vulnerability', 0.05850702039482417),
    ('risk', 0.04521581582197221),
    ('and', 0.044031738573732396),
    ('assessments', 0.040991157701148834),
    ('systems', 0.039176227310131484

In [74]:
# Inspect topic 3: shown below as a list of (word, score) pairs
topic_model.get_topic(3)

[('cyber', 0.08515743915573751),
 ('cybersecurity', 0.061543671236926764),
 ('army', 0.04530453213380502),
 ('will', 0.0415931611115692),
 ('the', 0.041398049622688886),
 ('as', 0.038721290691280184),
 ('you', 0.03583871417004505),
 ('for', 0.032958411127075475),
 ('cio', 0.03221181276091664),
 ('of', 0.028662075369281965)]

In [None]:
# Planned result layout (columns): ksa | matched | similar

In [None]:
# Planned contents of each match / sim entry: {topic_idx, info, score}

In [49]:
# Reload the saved topic model from disk, supplying the embedding model explicitly.
t2 = BERTopic.load(
    '../data/results/usa_jobs/similarity_match/topicModel',
    embedding_model=model,
)

In [50]:
# Topic overview table (Topic, Count, Name) for the reloaded model
t2.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,260,-1_and_security_the_of
1,0,73,0_work_mission_subordinate_of
2,1,56,1_vulnerabilities_identify_risks_vulnerability
3,2,53,2_information_unauthorized_systems_and
4,3,39,3_cyber_cybersecurity_army_will
5,4,38,4_cyberspace_tactical_operator_operators
6,5,32,5_security_and_engineering_gpo
7,6,30,6_established_comprehensive_and_ais
8,7,29,7_duties_include_but_limited
9,8,29,8_comsec_cryptographic_key_materials


In [55]:
# Visualise the topics of the fitted model
topic_model.visualize_topics()