In [37]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity as cs
import pandas as pd
import re
import numpy as np

In [66]:
# example KSA 
baseline =[
   'Knowledge of computer networking concepts and protocols, and network security methodologies.',
'Knowledge of laws, regulations, policies, and ethics as they relate to cybersecurity and privacy.',
'Knowledge of specific operational impacts of cybersecurity lapses.',
'Knowledge of cyber defense and vulnerability assessment tools and their capabilities.',
'Knowledge of cryptography and cryptographic key management concepts',
'Knowledge of Security Assessment and Authorization process.',
'Knowledge of vulnerability information dissemination sources (e.g., alerts, advisories, errata, and bulletins).',
'Knowledge of cybersecurity and privacy principles and organizational requirements (relevant to confidentiality, integrity, availability, authentication, non-repudiation).',
'Knowledge of Risk Management Framework (RMF) requirements.',
'Knowledge of information technology (IT) security principles and methods (e.g., firewalls, demilitarized zones, encryption).',
'Knowledge of network security architecture concepts including topology, protocols, components, and principles (e.g., application of defense-in-depth).',
'Knowledge of security architecture concepts and enterprise architecture reference models (e.g., Zachman, Federal Enterprise Architecture [FEA]).',
'Knowledge of security models (e.g., Bell-LaPadula model, Biba integrity model, Clark-Wilson integrity model)',
'Knowledge of laws, policies, procedures, or governance relevant to cybersecurity for critical infrastructures.',
'Knowledge of embedded systems.',
'Knowledge of penetration testing principles, tools, and techniques.',
'Knowledge of controls related to the use, processing, storage, and transmission of data.',
'Knowledge of Application Security Risks (e.g. Open Web Application Security Project Top 10 list)',

]

In [5]:
# import usajobs dataset and create single description column
file = './Data/Cleaned Data/USAJobs.csv'

In [8]:
df = pd.read_csv(file)

In [9]:
df['desc'] = df['Duties'] + df['Qualifications']

In [10]:
# get sample of 10 for testing
all_desc = df['desc'][0:10].tolist()

In [13]:
# BERT Model name from hugging face 
# https://huggingface.co/models?library=sentence-transformers&pipeline_tag=sentence-similarity&sort=downloads
model_name = 'sentence-transformers/all-MiniLM-L6-v2'

In [15]:
# Initilize Bert Model
model = SentenceTransformer(model_name)

In [16]:
# encodes job descriptions to vectors
vecs = model.encode(all_desc)

In [18]:
vecs.shape

(10, 384)

In [22]:
# Cosine Similarity between job[0] and all other jobs
cs(
    [vecs[0]],
    vecs[1:]
)

array([[0.6536449 , 0.6670726 , 0.6734115 , 0.6433995 , 0.6315858 ,
        0.6253699 , 0.5134918 , 0.65315384, 0.50179183]], dtype=float32)

In [54]:
# Try encoding individual sentences for each job descriptions
all_sents = []
all_sents_vecs = []
doc_means = []
for doc in all_desc:
    sents = []
    sent_means =[]
    sents = re.split('\/n|\.', doc)
    sent_vecs = model.encode(sents)
    for sent in sent_vecs:
        sent_means.append(np.mean(sent))
    doc_means.append(sent_means)
    all_sents_vecs.append(sent_vecs)
    all_sents.append(sents)
    

In [67]:
# vectorize the baseline sample
baseline_vecs = []
for sent in baseline:
    baseline_vecs.append(model.encode(sent))

In [52]:
# Try mean vectors... probably not the best
for doc in all_sents_vecs:
    doc_means.append(np.mean(doc))

In [59]:
# Get lenght of longest vector
maxl = 0
for doc in doc_means:
    if len(doc) > maxl:
        maxl = len(doc)
        
print(maxl)

29


In [63]:
# Standardize the shape of the vectors so all are same length
# make doc vectors same size by using empty arrays for missing size...
for doc in doc_means:
    if len(doc) < 29:
        temp = [0.0 for x in range(29-len(doc))]
        doc += temp

In [62]:
# Cosine similarity of mean vectors...not great results
cs(
    [doc_means[0]], 
    doc_means[1:]

)

array([[-0.08862176,  0.03189544,  0.10708588,  0.03456249,  0.18427945,
         0.44129559, -0.03227523, -0.10908088,  0.29336624]])

In [128]:
# Try using each KSA and compare it to each sentence for each job description
# Find sentences that matches (x > 0.7), similar matches ( 0.7< x >0.5), missing ( x < 0.5)
all_matches = []
all_similar =[]
all_missing =[]
for idx, sent in enumerate(baseline_vecs):
    print('ksa', idx)
    for idx2, job_sents in enumerate(all_sents_vecs):
        print('..job ', idx2)
        matched=[]
        similar=[]
        missing=[]
        val = cs([sent], job_sents)
#         print('cs vals', val)
        for idx3, num in enumerate(val[0]):
            print('.....sent', idx3)
            if num >=0.7:
                print('Possible Sentence Match! for baseline sent ', idx, ' and job ', idx2, ' at ', idx3)
                print(baseline[idx], ' <> ')
                print(all_sents[idx2][idx3])
                matched.append(all_sents[idx2][idx3])
            elif num < 0.7 and num > 0.5:
                similar.append(all_sents[idx2][idx3])
            else:
                missing.append((all_sents[idx2][idx3]))
    all_matches.append(matched)
    all_similar.append(similar)
    all_missing.append(missing)
    print('**\n**')

ksa 0
..job  0
.....sent 0
.....sent 1
.....sent 2
.....sent 3
.....sent 4
.....sent 5
.....sent 6
.....sent 7
.....sent 8
.....sent 9
.....sent 10
.....sent 11
.....sent 12
.....sent 13
.....sent 14
.....sent 15
.....sent 16
.....sent 17
.....sent 18
..job  1
.....sent 0
.....sent 1
.....sent 2
.....sent 3
.....sent 4
.....sent 5
.....sent 6
.....sent 7
.....sent 8
.....sent 9
.....sent 10
.....sent 11
.....sent 12
.....sent 13
.....sent 14
.....sent 15
.....sent 16
.....sent 17
.....sent 18
.....sent 19
.....sent 20
..job  2
.....sent 0
.....sent 1
.....sent 2
.....sent 3
.....sent 4
.....sent 5
.....sent 6
.....sent 7
.....sent 8
.....sent 9
..job  3
.....sent 0
.....sent 1
.....sent 2
.....sent 3
.....sent 4
.....sent 5
.....sent 6
.....sent 7
.....sent 8
.....sent 9
.....sent 10
.....sent 11
.....sent 12
.....sent 13
.....sent 14
.....sent 15
.....sent 16
.....sent 17
.....sent 18
.....sent 19
.....sent 20
.....sent 21
..job  4
.....sent 0
.....sent 1
.....sent 2
.....sent 3
.....

.....sent 0
.....sent 1
.....sent 2
.....sent 3
.....sent 4
.....sent 5
.....sent 6
.....sent 7
.....sent 8
.....sent 9
.....sent 10
.....sent 11
.....sent 12
.....sent 13
.....sent 14
.....sent 15
.....sent 16
.....sent 17
.....sent 18
.....sent 19
.....sent 20
.....sent 21
.....sent 22
.....sent 23
.....sent 24
.....sent 25
.....sent 26
.....sent 27
.....sent 28
..job  9
.....sent 0
.....sent 1
.....sent 2
.....sent 3
.....sent 4
.....sent 5
.....sent 6
.....sent 7
.....sent 8
.....sent 9
.....sent 10
.....sent 11
.....sent 12
.....sent 13
.....sent 14
.....sent 15
.....sent 16
.....sent 17
.....sent 18
.....sent 19
.....sent 20
**
**
ksa 4
..job  0
.....sent 0
.....sent 1
.....sent 2
.....sent 3
.....sent 4
.....sent 5
.....sent 6
.....sent 7
.....sent 8
.....sent 9
.....sent 10
.....sent 11
.....sent 12
.....sent 13
.....sent 14
.....sent 15
.....sent 16
.....sent 17
.....sent 18
..job  1
.....sent 0
.....sent 1
.....sent 2
.....sent 3
.....sent 4
.....sent 5
.....sent 6
.....sent 

.....sent 0
.....sent 1
.....sent 2
.....sent 3
.....sent 4
.....sent 5
.....sent 6
.....sent 7
.....sent 8
.....sent 9
.....sent 10
.....sent 11
.....sent 12
.....sent 13
.....sent 14
.....sent 15
.....sent 16
.....sent 17
.....sent 18
.....sent 19
.....sent 20
.....sent 21
..job  7
.....sent 0
.....sent 1
.....sent 2
.....sent 3
.....sent 4
.....sent 5
.....sent 6
.....sent 7
.....sent 8
.....sent 9
.....sent 10
.....sent 11
.....sent 12
.....sent 13
.....sent 14
.....sent 15
.....sent 16
.....sent 17
.....sent 18
.....sent 19
.....sent 20
.....sent 21
.....sent 22
.....sent 23
.....sent 24
.....sent 25
..job  8
.....sent 0
.....sent 1
.....sent 2
.....sent 3
.....sent 4
.....sent 5
.....sent 6
.....sent 7
.....sent 8
.....sent 9
.....sent 10
.....sent 11
.....sent 12
.....sent 13
.....sent 14
.....sent 15
.....sent 16
.....sent 17
.....sent 18
.....sent 19
.....sent 20
.....sent 21
.....sent 22
.....sent 23
.....sent 24
.....sent 25
.....sent 26
.....sent 27
.....sent 28
..job  9
..

.....sent 0
.....sent 1
.....sent 2
.....sent 3
.....sent 4
.....sent 5
.....sent 6
.....sent 7
.....sent 8
.....sent 9
.....sent 10
.....sent 11
.....sent 12
.....sent 13
.....sent 14
.....sent 15
.....sent 16
.....sent 17
.....sent 18
..job  6
.....sent 0
.....sent 1
.....sent 2
.....sent 3
.....sent 4
.....sent 5
.....sent 6
.....sent 7
.....sent 8
.....sent 9
.....sent 10
.....sent 11
.....sent 12
.....sent 13
.....sent 14
.....sent 15
.....sent 16
.....sent 17
.....sent 18
.....sent 19
.....sent 20
.....sent 21
..job  7
.....sent 0
.....sent 1
.....sent 2
.....sent 3
.....sent 4
.....sent 5
.....sent 6
.....sent 7
.....sent 8
.....sent 9
.....sent 10
.....sent 11
.....sent 12
.....sent 13
.....sent 14
.....sent 15
.....sent 16
.....sent 17
.....sent 18
.....sent 19
.....sent 20
.....sent 21
.....sent 22
.....sent 23
.....sent 24
.....sent 25
..job  8
.....sent 0
.....sent 1
.....sent 2
.....sent 3
.....sent 4
.....sent 5
.....sent 6
.....sent 7
.....sent 8
.....sent 9
.....sent 10

.....sent 15
.....sent 16
.....sent 17
.....sent 18
.....sent 19
.....sent 20
..job  2
.....sent 0
.....sent 1
.....sent 2
.....sent 3
.....sent 4
.....sent 5
.....sent 6
.....sent 7
.....sent 8
.....sent 9
..job  3
.....sent 0
.....sent 1
.....sent 2
.....sent 3
.....sent 4
.....sent 5
.....sent 6
.....sent 7
.....sent 8
.....sent 9
.....sent 10
.....sent 11
.....sent 12
.....sent 13
.....sent 14
.....sent 15
.....sent 16
.....sent 17
.....sent 18
.....sent 19
.....sent 20
.....sent 21
..job  4
.....sent 0
.....sent 1
.....sent 2
.....sent 3
.....sent 4
.....sent 5
.....sent 6
.....sent 7
.....sent 8
.....sent 9
.....sent 10
.....sent 11
.....sent 12
.....sent 13
.....sent 14
.....sent 15
.....sent 16
.....sent 17
.....sent 18
.....sent 19
.....sent 20
.....sent 21
.....sent 22
..job  5
.....sent 0
.....sent 1
.....sent 2
.....sent 3
.....sent 4
.....sent 5
.....sent 6
.....sent 7
.....sent 8
.....sent 9
.....sent 10
.....sent 11
.....sent 12
.....sent 13
.....sent 14
.....sent 15
...

In [129]:
# KSA baseline[6] matches Job[2] Sentence[2]
print(baseline[6])
print(all_sents[2][2])

Knowledge of vulnerability information dissemination sources (e.g., alerts, advisories, errata, and bulletins).
Conducts risk and vulnerability assessments of planned and installed information systems changes to identify vulnerabilities and protection needs and develops corresponding recommendations to reduce or eliminate risk


In [144]:
sent_total = 0
for x in all_sents:
    sent_total += len(x)
print(sent_total)

212


In [148]:
# I think my math is off / wrong formula... may have to loop differently / re-do lists???
for idx, val in enumerate(baseline):
    print('ksa ', idx)
    matched_score = len(all_matches[idx])/(len(all_desc) + len(baseline))
    similar_score = len(all_similar[idx])/(len(all_desc) + len(baseline))
#     missing_score = len(all_missing[idx])/(len(all_desc)+ len(baseline))
    missing_score = 1 -(matched_score + similar_score)
    print('Match Score ==', matched_score)
    print('Similar Score == ', similar_score)
    print('Missing Score ==', missing_score)

ksa  0
Match Score == 0.0
Similar Score ==  0.0
Missing Score == 1.0
ksa  1
Match Score == 0.0
Similar Score ==  0.0
Missing Score == 1.0
ksa  2
Match Score == 0.0
Similar Score ==  0.03571428571428571
Missing Score == 0.9642857142857143
ksa  3
Match Score == 0.0
Similar Score ==  0.03571428571428571
Missing Score == 0.9642857142857143
ksa  4
Match Score == 0.0
Similar Score ==  0.0
Missing Score == 1.0
ksa  5
Match Score == 0.0
Similar Score ==  0.0
Missing Score == 1.0
ksa  6
Match Score == 0.0
Similar Score ==  0.03571428571428571
Missing Score == 0.9642857142857143
ksa  7
Match Score == 0.0
Similar Score ==  0.0
Missing Score == 1.0
ksa  8
Match Score == 0.0
Similar Score ==  0.0
Missing Score == 1.0
ksa  9
Match Score == 0.0
Similar Score ==  0.0
Missing Score == 1.0
ksa  10
Match Score == 0.0
Similar Score ==  0.0
Missing Score == 1.0
ksa  11
Match Score == 0.0
Similar Score ==  0.0
Missing Score == 1.0
ksa  12
Match Score == 0.0
Similar Score ==  0.0
Missing Score == 1.0
ksa  13