In [None]:
# Run this for first install of packages!
# !pip install -r requirements.txt

In [None]:
# Imports
import re
from pathlib import Path

import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

import helpers # this is a Joe G. created helper file of functions

In [None]:
# --- Run configuration: edit these values for your own run ---

# Column of the input file that holds the job description text.
desc_column = 'Duties'

# Source CSV of job postings and destination CSV for the similarity results.
input_file = '../data/cleaned_data/USAJobs.csv'
output_file = '../data/results/usa_jobs/similarity_match/batch1.csv'

# Row window of the input file to score: start_idx is the first row used and
# end_idx is the row to stop before. Set start_idx to -1 to use the entire file.
start_idx, end_idx = 0, 100

In [None]:
# Initialize the pre-trained sentence-embedding model.
# The model is downloaded automatically from Hugging Face by name.
model_name = 'sentence-transformers/paraphrase-distilroberta-base-v2'
model = SentenceTransformer(model_name)

In [None]:
# Cybersecurity baseline: the column that holds the KSA statements and the CSV
# file they are read from.
# TODO: confirm this is the final baseline file (an earlier note asked whether
# Knowledge_Units.csv should be used instead), and switch to a saved
# embeddings file once one is available.
ksa_col = 'KUKSAT'
baseline_file = '../data/cleaned_data/KUKSAT_Baseline.csv'

In [None]:
# Read baseline_file into a DataFrame (decoded as ISO-8859-1) and pull the
# KSA column out as a plain Python list.
df_base = pd.read_csv(baseline_file, encoding='ISO-8859-1')
ksas = df_base[ksa_col].tolist()

In [None]:
# Number of KSA rows in the baseline file.
df_base.shape[0]

In [None]:
# Load the job postings and extract the description column as a plain list.
# A non-negative start_idx that differs from end_idx selects the row window
# start_idx..end_idx (end exclusive); any other combination uses every row.
df = pd.read_csv(input_file)
use_window = start_idx >= 0 and start_idx != end_idx
selected_rows = df[start_idx:end_idx] if use_window else df
jobs = selected_rows[desc_column].tolist()

In [None]:
# Number of job descriptions selected for scoring.
len(jobs)

In [None]:
# Run the shared text-cleanup helper over the job descriptions and the KSA
# baseline statements, replacing each list with the helper's result.
# NOTE(review): later cells index ksas by position against the baseline
# embeddings, so cleanup_text presumably keeps one entry per input in the same
# order -- confirm in helpers.
jobs = helpers.cleanup_text(jobs)
ksas = helpers.cleanup_text(ksas)

In [None]:
# Split each job description into sentences.
# NOTE(review): job_sent is later indexed as job_sent[job][sentence], so
# split_sents presumably returns one list of sentences per job -- confirm in
# helpers.
job_sent = helpers.split_sents(jobs)

In [None]:
# Encode the cleaned KSA baseline statements into embedding vectors.
baseline_vecs = model.encode(ksas)

In [None]:
# Save the baseline embeddings to disk. This line is active; comment it out
# to skip saving.
# NOTE(review): the target is a .pickle file -- only load it back from a
# trusted source.
helpers.save_embeddings(baseline_vecs, '../data/saved_embeddings/baseline.pickle')

In [None]:
# Encode the sentences of every job description. Each job's sentences are
# encoded in one call, so all_job_vecs holds one encode() result per job, in
# the same order as job_sent.
all_job_vecs = [model.encode(job_sentences) for job_sentences in job_sent]

In [None]:
# Save the job sentence embeddings to disk.
# Fix: this previously wrote baseline_vecs, so the "job" embeddings file was
# just a second copy of the KSA baseline embeddings.
# NOTE(review): all_job_vecs is a list with one encode() result per job;
# confirm helpers.save_embeddings accepts a list as well as a single array.
helpers.save_embeddings(all_job_vecs, '../data/saved_embeddings/usajob_test.pickle')

In [None]:
# Compare every KSA with every sentence of every job description and bucket
# each (KSA, sentence) pair by cosine similarity:
#   matched : score >= 0.6, or the KSA text appears verbatim in the sentence
#   similar : 0.4 < score < 0.6
#   missing : everything else (score <= 0.4)
# Results are stored per KSA: all_matches[k], all_similar[k] and
# all_missing[k] are lists of record dicts for KSA k.
MATCH_THRESHOLD = 0.6
SIMILAR_THRESHOLD = 0.4

all_matches = []
all_similar = []
all_missing = []
# Outer loop: one pass per KSA baseline embedding.
for ksa_idx, ksa_vec in enumerate(baseline_vecs):
    print('ksa', ksa_idx, '_'*10)
    matched = []
    similar = []
    missing = []
    # Normalise the KSA text once per KSA rather than once per sentence.
    ksa_text = ksas[ksa_idx]
    ksa_norm = ksa_text.lower().strip()
    # Inner loop: one set of sentence embeddings per job description.
    for job_idx, sent_vecs in enumerate(all_job_vecs):
        print('..job ', job_idx)
        # A job that produced no sentences has nothing to score, and
        # cosine_similarity rejects an empty array, so skip it.
        if len(sent_vecs) == 0:
            continue
        # Row 0 holds the similarity of this KSA to each sentence of the job.
        scores = cosine_similarity([ksa_vec], sent_vecs)[0]

        for sent_idx, score in enumerate(scores):
            sentence_text = job_sent[job_idx][sent_idx]
            record = {
                'job_idx': job_idx,
                'sentence_idx': sent_idx,
                'ksa_idx': ksa_idx,
                'sentence_text': sentence_text,
                'ksa_text': ksa_text,
                'sim_score': score,
            }
            # A verbatim occurrence of the KSA text also counts as a match.
            # An empty KSA string is a substring of every sentence, so the
            # text check is only applied when the KSA actually has text.
            text_hit = bool(ksa_norm) and ksa_norm in sentence_text.lower().strip()
            if score >= MATCH_THRESHOLD or text_hit:
                matched.append(record)
            elif score > SIMILAR_THRESHOLD:
                # Reaching here means score < MATCH_THRESHOLD already.
                similar.append(record)
            else:
                missing.append(record)
    all_matches.append(matched)
    all_similar.append(similar)
    all_missing.append(missing)
    print('**\n**')

In [None]:
# Total number of sentences compared, summed over all job descriptions.
sent_total = sum(len(job_sentences) for job_sentences in job_sent)
print(sent_total)

In [None]:
# Per-KSA aggregate scores: the share of all compared sentences that landed in
# the matched and similar buckets; the remainder is reported as missing.
# The denominator is the total sentence count.
# ? Open question: should the denominator be len(jobs) instead?
ksa_agg_matches = []
ksa_agg_similar = []
ksa_agg_missing = []
for idx in range(len(baseline_vecs)):
    print('ksa', idx, '_'*10)
    matched_score = len(all_matches[idx]) / sent_total
    similar_score = len(all_similar[idx]) / sent_total
    # Whatever is neither matched nor similar counts as missing.
    missing_score = 1 - (matched_score + similar_score)

    per_bucket = (
        (ksa_agg_matches, matched_score),
        (ksa_agg_similar, similar_score),
        (ksa_agg_missing, missing_score),
    )
    for bucket, bucket_score in per_bucket:
        bucket.append(bucket_score)

    print('Match Score ==', matched_score)
    print('Similar Score == ', similar_score)
    print('Missing Score ==', missing_score)

In [None]:
# Collect everything into one results table with one row per KSA: the KSA
# text, its three lists of sentence records, and its three aggregate scores.
result_columns = dict(
    ksa=ksas,
    matches=all_matches,
    similar=all_similar,
    missing=all_missing,
    matched_score=ksa_agg_matches,
    similar_score=ksa_agg_similar,
    missing_score=ksa_agg_missing,
)
final_df = pd.DataFrame(result_columns)

In [None]:
# Save the results table to output_file.
# to_csv does not create missing folders, and a failure here would surface
# only after the long scoring run has finished, so create the destination
# directory first.
# NOTE(review): the matches/similar/missing columns hold lists of dicts and
# are written to the CSV as their string representation.
output_path = Path(output_file)
output_path.parent.mkdir(parents=True, exist_ok=True)
final_df.to_csv(output_path, index=False)

In [None]:
# Local spot check: rank the KSAs from highest to lowest matched score,
# breaking ties on the similar score and then the missing score.
score_columns = ['matched_score', 'similar_score', 'missing_score']
final_df.sort_values(by=score_columns, ascending=False)

In [None]:
# Local spot check: print the first KSA in the results table followed by the
# text of every sentence that matched it. Prints nothing for an empty table.
for _, first_row in final_df.head(1).iterrows():
    print('ksas===', first_row['ksa'], ' \n ')
    for hit in first_row['matches']:
        print('matched ===', hit['sentence_text'])