### Topic Modeling: Demand for Cybersecurity Professionals  

In [8]:
import pandas as pd 
import numpy as np
import nltk 
from nltk.corpus import stopwords 
import re
import string

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

#### Data Preprocessing 
Indeed Data 

In [33]:
df = pd.read_csv("C:/Users/hanna/Scape-Save-DAEN690/Datasets/indeed_cleaned.csv.gz")

In [34]:
#Remove duplicate postings if the job title, description and location are the same 
df.drop_duplicates(subset=['job_title', 'description_text', 'location'], inplace=True)

In [35]:
#Set to lower case so capitalized titles don't get counted separately
df['job_title'] = df['job_title'].apply(lambda x: x.lower())
#remove numerical values from job description
df['description_text'] = df['description_text'].str.replace('\d+', '')
#remove links from job description 
df['description_text'] = df['description_text'].str.replace('http\S+|www.\S+', '', case=False)

df['description'] = df['description'].apply(lambda x: str(x).lower())


In [37]:
#Create function to categorize role types 

def role_type(x):
    if 'lead' in x:
        return 'senior'
    elif 'senior' in x:
        return 'senior'
    elif 'sr' in x:
        return 'senior'
    elif 'chief' in x:
        return 'upper mgmt'
    elif 'manager' in x:
        return 'upper mgmt'
    elif 'principal' in x:
        return 'upper mgmt'
    elif 'junior' in x:
        return 'entry-level'
    elif 'jr' in x:
        return 'entry-level'
    elif 'entry-level' in x:
        return 'entry-level'
    elif 'entry level' in x:
        return 'entry-level'
    elif 'professor' in x: 
        return 'academia'
    else:
        return 'regular'
    
df['role_type'] = df['job_title'].apply(role_type)

#Can be used for visualizations 
job_df = df.groupby('role_type').count().reset_index()[['role_type', 'job_title']]


In [38]:
#Comparing job descriptions using cosine similarity will help remove job descriptions that are almost identical with additional word difference 
#Helps reduce processing time of the model 

corpus= df['description_text']

count_vect = CountVectorizer()

x_counts = count_vect.fit_transform(corpus)

cos_df = pd.DataFrame(cosine_similarity(x_counts))
 
i, j = np.indices(cos_df.shape).reshape(2, -1)

cos_values = cos_df.values.reshape(-1)

cos_sim_df = pd.DataFrame({'i': i, 'j': j, 'score':cos_values})

#similarity score >0.99 indicates that the job descriptions are very similar with slight variations in wordings
cos_rem = cos_sim_df[(cos_sim_df['score']>0.99)&(i!=j)]

cos_rem['i*j'] = cos_rem['i'] * cos_rem['j']
drop_rows = np.unique(cos_rem.drop_duplicates(subset=['i*j'], keep='first')['i'].values)

#Drop rows with >0.99 similarity score from original dataframe
df = df[~df.index.isin(drop_rows)]
df = df.reset_index()
df.drop(columns={'index'}, inplace=True)

CareerOneStop Data

In [60]:
cos_df = pd.read_excel("C:/Users/hanna/Scape-Save-DAEN690/Datasets/careeronestop_data.xlsx") 

In [61]:
cos_df.drop_duplicates(subset=['job_title', 'description', 'location'], inplace=True)
#Set to lower case so capitalized titles don't get counted separately
cos_df['job_title'] = cos_df['job_title'].apply(lambda x: str(x).lower())

In [63]:
#Set to lower case so capitalized titles don't get counted separately
cos_df['job_title'] = cos_df['job_title'].apply(lambda x: x.lower())
#remove numerical values from job description
cos_df['description'] = cos_df['description'].str.replace('\d+', '')
#remove links from job description 
cos_df['description'] = cos_df['description'].str.replace('http\S+|www.\S+', '', case=False)

cos_df['description'] = cos_df['description'].apply(lambda x: str(x).lower())

In [64]:
#apply function to add job roles 
cos_df['role_type'] = cos_df['job_title'].apply(role_type)

job_df = cos_df.groupby('role_type').count().reset_index()[['role_type', 'job_title']]

In [65]:
cos_df.shape

(8920, 8)

In [66]:
corpus= cos_df['description']

count_vect = CountVectorizer()

x_count_cos = count_vect.fit_transform(corpus.values.astype('U'))

cosi_df = pd.DataFrame(cosine_similarity(x_count_cos))

i, j = np.indices(cosi_df.shape).reshape(2, -1)

cos_values = cosi_df.values.reshape(-1)

cos_sim_df = pd.DataFrame({'i': i, 'j': j, 'score':cos_values})

#similarity score >0.99 indicates that the job descriptions are very similar with slight variations in wordings 
cos_rem = cos_sim_df[(cos_sim_df['score']>0.99)&(i!=j)]

cos_rem['i*j'] = cos_rem['i'] * cos_rem['j']
drop_rows = np.unique(cos_rem.drop_duplicates(subset=['i*j'], keep='first')['i'].values)

cos_df = cos_df[~cos_df.index.isin(drop_rows)]
cos_df = cos_df.reset_index()
cos_df.drop(columns={'index'}, inplace=True)

#### BERTopic Algorithm

In [76]:
from umap import UMAP
from hdbscan import HDBSCAN
from bertopic import BERTopic 
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.representation import MaximalMarginalRelevance
from bertopic.vectorizers import ClassTfidfTransformer

# Step 1 - Extract embeddings
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Step 2 - Reduce dimensionality
umap_model = UMAP(n_neighbors=10, n_components=5, min_dist=0.0, metric='cosine')

# Step 3 - Cluster reduced embeddings
hdbscan_model = HDBSCAN(min_cluster_size=5, metric='euclidean', cluster_selection_method='eom', prediction_data=True)

# Step 4 - Tokenize topics
# ngram_range set to 2 so the topics extracted can also be 'two-words' instead of one 
vectorizer_model = CountVectorizer(ngram_range=(1,2), stop_words="english")

# Step 5 - Create topic representation
ctfidf_model = ClassTfidfTransformer()

# Step 6 - Fine-tune topic representations with 
# reduces redundacy and improve diversity of keywords using MMR 
representation_model = MaximalMarginalRelevance(0.4)

topic_model = BERTopic(
  embedding_model=embedding_model,         
  umap_model=umap_model,                    
  hdbscan_model=hdbscan_model,              
  vectorizer_model=vectorizer_model,        
  ctfidf_model=ctfidf_model,                
  representation_model=representation_model) 

topic_model_in = BERTopic(
  embedding_model=embedding_model,          
  umap_model=umap_model,                    
  hdbscan_model=hdbscan_model,              
  vectorizer_model=vectorizer_model,       
  ctfidf_model=ctfidf_model,                
  representation_model=representation_model) 

In [77]:
#CareerOneStop model
topic, prob = topic_model.fit_transform(cos_df['description'].values.astype('U'))

In [40]:
#Indeed Model
topic_in, prob_in = topic_model_in.fit_transform(df['description_text'])

In [79]:
#model with careeronestop data
topic_model.visualize_barchart(top_n_topics=5)

In [41]:
#model with indeed data 
topic_model_in.visualize_barchart(top_n_topics=5)

In [None]:
freq = topic_model_in.get_topic_info() 
print(freq)

     Topic  Count                                               Name
0       -1   3441  -1_experience_management_cybersecurity_informa...
1        0    120   0_aws_aws security_leadership principles_balance
2        1    110  1_dod_officer isso_security officer_systems se...
3        2    105                  2_dod_clearance_certification_rmf
4        3    103             3_teaching_students_curriculum_courses
..     ...    ...                                                ...
551    550      5  550_disaster recovery_healthplan_business cont...
552    551      5  551_commscope_application security_security te...
553    552      5    552_noblis_security evaluation_dhs_testing firm
554    553      5  553_onsemi_ensure asg_credentials_security del...
555    554      5  554_aci_aci products_security testing_security...

[556 rows x 3 columns]


In [59]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,875,-1_experience_systems_management_cybersecurity
1,0,115,0_architect_aws_azure_solutions
2,1,79,1_security_management_experience_analyst
3,2,37,2_soc_soc analyst_operations center_incidents
4,3,37,3_requirements gdit_mandate_gdit people_covid ...
...,...,...,...
130,129,5,129_shi_identity access_oracle_management
131,130,5,130_embedded systems_efforts identify_agco_cyb...
132,131,5,131_physical security_entergy companies_networ...
133,132,5,132_lockheed martin_schedules_based states_dan...


In [None]:
#keywords present in our top topics 
topic_model.get_topic(freq.iloc[15]["Topic"])

[('compensation range', 0.024140114128362616),
 ('pwc', 0.017618603521963337),
 ('dependent individual', 0.01607998665091447),
 ('individual skills', 0.015990080352396064),
 ('eligible annual', 0.015182537795242014),
 ('experience qualifications', 0.01452364027398087),
 ('discretionary bonus', 0.014218430535273887),
 ('pricewaterhousecoopers itservices', 0.013535201291892783),
 ('residing', 0.010969723946617235),
 ('199 000', 0.010648478187735717)]