In [50]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity as cs
import pandas as pd
import re
import numpy as np
import numba

In [51]:
# example KSA 
baseline =[
'Knowledge of computer networking concepts and protocols, and network security methodologies.',
'Knowledge of laws, regulations, policies, and ethics as they relate to cybersecurity and privacy.',
'Knowledge of specific operational impacts of cybersecurity lapses.',
'Knowledge of cyber defense and vulnerability assessment tools and their capabilities.',
'Knowledge of cryptography and cryptographic key management concepts',
'Knowledge of Security Assessment and Authorization process.',
'Knowledge of vulnerability information dissemination sources (e.g., alerts, advisories, errata, and bulletins).',
'Knowledge of cybersecurity and privacy principles and organizational requirements (relevant to confidentiality, integrity, availability, authentication, non-repudiation).',
'Knowledge of Risk Management Framework (RMF) requirements.',
'Knowledge of information technology (IT) security principles and methods (e.g., firewalls, demilitarized zones, encryption).',
'Knowledge of network security architecture concepts including topology, protocols, components, and principles (e.g., application of defense-in-depth).',
'Knowledge of security architecture concepts and enterprise architecture reference models (e.g., Zachman, Federal Enterprise Architecture [FEA]).',
'Knowledge of security models (e.g., Bell-LaPadula model, Biba integrity model, Clark-Wilson integrity model)',
'Knowledge of laws, policies, procedures, or governance relevant to cybersecurity for critical infrastructures.',
'Knowledge of embedded systems.',
'Knowledge of penetration testing principles, tools, and techniques.',
'Knowledge of controls related to the use, processing, storage, and transmission of data.',
'Knowledge of Application Security Risks (e.g. Open Web Application Security Project Top 10 list)'
]

In [52]:
baseline = [re.sub('Knowledge of ', '', base).lower() for base in baseline]

In [53]:
# import usajobs dataset and create single description column
file = './Data/Cleaned Data/USAJobs.csv'
df = pd.read_csv(file)
df['desc'] = df['Duties'] + df['Qualifications']

In [54]:
# get sample of 10 for testing
all_desc = df['desc'][0:100].tolist()

In [55]:
from nltk.corpus import stopwords

cachedStopWords = stopwords.words("english")

In [70]:
# rm stopwords
# Even though BERT is sentence based and prefers stop words for syntax/context understanding, 
no_stop_sents = []
no_stop_desc = []
for desc in all_desc:
    text = ' '.join([word for word in desc.split() if word not in cachedStopWords]).strip()
    text = re.sub('•', '', text)
    text = re.sub('-', '', text)
    no_stop_desc += text
    sents = re.split('\/n|\.', text)
    sents = [sent.strip() for sent in sents]
    no_stop_sents += sents

In [56]:
# from top2vec import Top2Vec
from bertopic import BERTopic

In [57]:
# model_name = 'sentence-transformers/all-MiniLM-L6-v2'
model_name = 'sentence-transformers/paraphrase-distilroberta-base-v2'

In [58]:
model = SentenceTransformer(model_name)

In [61]:
# cleans text and removes bullets and bad chars
all_sents = []
all_sents_vecs = []
cleaned_sents = []
cleaned_desc = []
for doc in all_desc:
    sents = []
    cleaned = re.sub('•', '', doc)
    cleaned = re.sub('-', '', cleaned)
    cleaned_desc += cleaned.strip().lower()
    sents = re.split('\/n|\.', cleaned)
    sents = [sent.strip() for sent in sents if len(sent.strip()) >1]
    cleaned_sents += sents
#     sent_vecs = model.encode(sents)
#     all_sents_vecs.append(sent_vecs)
    all_sents.append(sents)

In [71]:
# cleans further so not just one word sents
cleaned_sents = [text for text in cleaned_sents if len(text.strip()) >0 or text != '']
cleaned_stop_sents = [text for text in no_stop_sents if len(text.strip()) >0 or text != '']

# len(cleaned_text)
len(cleaned_stop_text)

2794

In [72]:
# create topic baseline based on ksa keywords
cleaned_baseline = []
for ksa in baseline:
    text = ' '.join([word for word in ksa.split() if word not in cachedStopWords]).strip()
    text = re.sub('Knowledge', '', text)
    keywords = list(set(text.lower().split(' ')))
    keywords = [word for word in keywords if len(word) >0 or word != '']
    cleaned_baseline.append(keywords)

In [64]:
cleaned_baseline

[['methodologies.',
  'protocols,',
  'computer',
  'networking',
  'security',
  'network',
  'concepts'],
 ['cybersecurity',
  'relate',
  'regulations,',
  'privacy.',
  'laws,',
  'ethics',
  'policies,'],
 ['cybersecurity', 'operational', 'impacts', 'lapses.', 'specific'],
 ['capabilities.', 'vulnerability', 'tools', 'defense', 'cyber', 'assessment'],
 ['key', 'management', 'cryptography', 'concepts', 'cryptographic'],
 ['authorization', 'process.', 'security', 'assessment'],
 ['vulnerability',
  'advisories,',
  'errata,',
  'bulletins).',
  '(e.g.,',
  'information',
  'dissemination',
  'alerts,',
  'sources'],
 ['cybersecurity',
  'requirements',
  'authentication,',
  'non-repudiation).',
  'privacy',
  'integrity,',
  'principles',
  'availability,',
  '(relevant',
  'organizational',
  'confidentiality,'],
 ['risk', '(rmf)', 'framework', 'requirements.', 'management'],
 ['methods',
  'firewalls,',
  'technology',
  'encryption).',
  'principles',
  '(it)',
  '(e.g.,',
  'de

In [67]:
# set seed topic list based on ksa keywords
# desc lists take too long...
topic_model = BERTopic(embedding_model=model,seed_topic_list=cleaned_baseline,nr_topics=18)

topics, probs = topic_model.fit_transform(cleaned_sents)

In [74]:
topic_model_no_stop = BERTopic(embedding_model=model,seed_topic_list=cleaned_baseline,nr_topics=18)

topics_no_stop, probs_no_stop = topic_model_no_stop.fit_transform(cleaned_stop_text)

In [76]:
# gets model info and topics
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,970,-1_security_and_information_of
1,0,228,0_oral_to_detail_attention
2,1,146,1_at_specialized_federal_level
3,2,137,2_your_resume_describe_transcripts
4,3,114,3_cyber_and_vulnerability_vulnerabilities
5,4,99,4_community_student_spiritual_philanthropic
6,5,93,5_products_their_services_or
7,6,83,6_weeks_52_evaluated_knowledge
8,7,78,7_information_systems_and_access
9,8,72,8_position_each_for_specific


In [80]:
# gets topic words of most importance
topic_model.get_topic(1)

[('at', 0.07119183077715979),
 ('specialized', 0.06610990050934072),
 ('federal', 0.06595932020217464),
 ('level', 0.06279085146474954),
 ('in', 0.05774881905778416),
 ('experience', 0.055488345101313716),
 ('the', 0.055404181022593625),
 ('year', 0.05192533978747588),
 ('equivalent', 0.04892845144513671),
 ('service', 0.048867921197429644)]

In [77]:
topic_model_no_stop.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,996,-1_security_information_level_federal
1,0,210,0_oral_detail_attention_for
2,1,182,1_cyber_vulnerabilities_vulnerability_security
3,2,145,2_community_student_spiritual_religious
4,3,122,3_resume_describe_transcripts_must
5,4,120,4_security_information_methods_systems
6,5,104,5_grade_52_weeks_federal
7,6,93,6_requirements_must_education_in
8,7,91,7____
9,8,90,8_network_systems_security_operating


In [79]:
topic_model_no_stop.get_topic(1)

[('cyber', 0.0642465890180391),
 ('vulnerabilities', 0.05730008784878107),
 ('vulnerability', 0.055622956635209914),
 ('security', 0.040670137997600754),
 ('risks', 0.037282809800865),
 ('specific', 0.03613908406846057),
 ('systems', 0.03600897804681945),
 ('identify', 0.034499694191872536),
 ('risk', 0.03209989900379493),
 ('describing', 0.03148216529016718)]

In [38]:
# cool viz
topic_model.visualize_topics()

In [29]:
# other viz
topic_model.visualize_barchart()

In [30]:
# updates topic model to use bigrams (two words versus single word)
topic_model.update_topics(cleaned_text, topics, n_gram_range=(1, 2))

In [31]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,639,-1_and_security_of_to
1,0,92,0_or___
2,1,90,1_security_information_it_methods
3,2,54,2_four_each of_four competencies_the four
4,3,54,3_cyber_cybersecurity_and_ability to
...,...,...,...
79,78,11,78_training_ready_events_training events
80,79,11,79_application process_occupational_submit an_...
81,80,11,80_subordinate supervisors_supervisors_subordi...
82,81,11,81_transcript_if you_basing all_note transcript


In [85]:
# topic similarity to search string
# may be a good approach to see if any extracted topics match a given ksa search string?
topics, similarity = topic_model.find_topics("Knowledge of computer networking concepts and protocols, and network security methodologies."
                                             , top_n=5)

In [86]:
topic_model.get_topic(topics[0])

[('network', 0.11643941567769868),
 ('systems', 0.08582645501295438),
 ('security', 0.07618274756760286),
 ('and', 0.059439311506625646),
 ('design', 0.04764752877408069),
 ('operating', 0.04468209423003542),
 ('implementation', 0.04268997515061435),
 ('installation', 0.04250142716172592),
 ('of', 0.03978940308956433),
 ('knowledge', 0.03730138835518446)]

In [88]:
similarity[0]

0.4095711491587136