In [2]:
import pandas as pd 
import numpy as np
from bertopic import BERTopic 
import nltk 
from nltk.corpus import stopwords 
import re
import string

#from nltk.tokenize import word_tokenize
#from nltk.stem import WordNetLemmatizer

In [21]:
# Load the two datasets used for topic modelling.
# Both files live in the same folder, so the folder is spelled out exactly once;
# change DATA_DIR to run the notebook on another machine.
DATA_DIR = "C:/Users/hanna/Scape-Save-DAEN690/Datasets"

# Indeed data (switched to Indeed for better interpretation)
df = pd.read_csv(f"{DATA_DIR}/indeed_cleaned.csv.gz")
#df = df[0:500]  # uncomment to minimize processing time while testing the code

# NICE framework; the file is decoded as Windows-1252 (cp1252)
df2 = pd.read_csv(f"{DATA_DIR}/nice.csv", encoding='cp1252')

In [22]:
from umap import UMAP
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.representation import MaximalMarginalRelevance
from bertopic.vectorizers import ClassTfidfTransformer
#import spacy_transformers
#import spacy

# Fixed seed for UMAP so that repeated runs produce the same reduced
# embeddings (UMAP is stochastic unless random_state is set).
RANDOM_STATE = 42

# Step 1 - Extract embeddings
# The sentence encoder is pre-trained and only used for encoding, so a single
# instance is shared by both topic models.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

#spacy.prefer_gpu()
#nlp = spacy.load("en_core_web_trf", exclude=['tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer'])


def build_topic_model():
    """Return an unfitted BERTopic pipeline with its own private sub-models.

    BERTopic fits the dimensionality-reduction, clustering, vectorizer and
    c-TF-IDF objects it is given. Every call therefore creates fresh
    instances, so that fitting one topic model can never overwrite the fitted
    state of another one.

    Returns:
        BERTopic: a new, unfitted topic model using the shared
        ``embedding_model`` and freshly constructed step 2-6 components.
    """
    return BERTopic(
        # Step 1 - Extract embeddings
        embedding_model=embedding_model,
        # Step 2 - Reduce dimensionality
        umap_model=UMAP(n_neighbors=15, n_components=5, min_dist=0.0,
                        metric='cosine', random_state=RANDOM_STATE),
        # Step 3 - Cluster reduced embeddings
        hdbscan_model=HDBSCAN(min_cluster_size=15, metric='euclidean',
                              cluster_selection_method='eom', prediction_data=True),
        # Step 4 - Tokenize topics
        vectorizer_model=CountVectorizer(stop_words="english"),
        # Step 5 - Create topic representation (extract topic words)
        ctfidf_model=ClassTfidfTransformer(),
        # Step 6 - Fine-tune topic representations with MMR, which reduces
        # redundancy and improves the diversity of the keywords
        representation_model=MaximalMarginalRelevance(diversity=0.3),
    )


# All steps together with indeed data
topic_model = build_topic_model()

# All steps together with NICE framework data
topic_model_nice = build_topic_model()

# The individual components used to be notebook-level variables; keep those
# names bound (to the Indeed model's components) so any cell that still
# refers to them keeps working.
umap_model = topic_model.umap_model
hdbscan_model = topic_model.hdbscan_model
vectorizer_model = topic_model.vectorizer_model
ctfidf_model = topic_model.ctfidf_model
representation_model = topic_model.representation_model

In [23]:
# Fit the Indeed topic model on the 'description_text' column; fit_transform
# returns the topic assignments together with their probabilities.
indeed_fit_result = topic_model.fit_transform(df['description_text'])
topics, probs = indeed_fit_result

2023-03-14 10:03:10,120 - BERTopic - Transformed documents to Embeddings
2023-03-14 10:03:53,847 - BERTopic - Reduced dimensionality
2023-03-14 10:04:02,484 - BERTopic - Clustered reduced embeddings


In [None]:
# Fit the NICE framework topic model on the 'ksat' column.
# The results get their own names so they do not overwrite `topics` / `probs`
# from the Indeed model, which later steps on the Indeed data (e.g. outlier
# reduction) need to stay paired with df['description_text'].
topics_nice, probs_nice = topic_model_nice.fit_transform(df2['ksat'])

In [24]:
# Bar chart of the extracted topics, limited to the top 5 (top_n_topics=5)
indeed_barchart = topic_model.visualize_barchart(top_n_topics=5)
indeed_barchart

In [26]:
# Topic overview table: one row per topic with its document count and name
freq = topic_model.get_topic_info()

# Only the last expression of a cell is rendered automatically, so the preview
# has to be displayed explicitly (`display` is provided by the IPython kernel).
display(freq.head(5))

topic_model.get_topic(0)  # Select the most frequent topic

[('faculty', 0.50694627),
 ('courses', 0.44617814),
 ('curriculum', 0.4403103),
 ('academic', 0.4222192),
 ('student', 0.4181387),
 ('qualifications', 0.41637185),
 ('applicants', 0.40955865),
 ('campus', 0.40892303),
 ('semester', 0.40400717),
 ('instructor', 0.39675337)]

In [27]:
# Topic -1 collects the documents treated as noise (outliers); a large
# outlier topic can hurt the quality of the topic representations.
freq

Unnamed: 0,Topic,Count,Name
0,-1,12518,-1_cybersecurity_security_role_qualifications
1,0,599,0_faculty_courses_curriculum_academic
2,1,395,1_mitre_fulfilling_workplace_jobs
3,2,385,2_cybersecurity_security_officer_requirements
4,3,368,3_crowdstrike_cyberattacks_wearecrowdstrike_leads
...,...,...,...
595,594,16,594_securitycenter_leidos_mcafee_sddc
596,595,15,595_clients_client_engineers_architect
597,596,15,596_azure_security_compliance_cloud
598,597,15,597_robotic_assisted_patients_healthcare


In [13]:
#reduce outliers and update topics before fitting the updated model 
#source: https://maartengr.github.io/BERTopic/getting_started/outlier_reduction/outlier_reduction.html#exploration
#new_topic= topic_model.reduce_outliers(df['description_text'], topics)
#topic_model.update_topics(df['description_text'], topics=new_topic)

In [None]:
# Optional 2-D document map of the Indeed model (currently disabled).
# Every line of the statement below must stay commented out together: leaving
# only the continuation line active makes the whole cell fail with an
# IndentationError.
#sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
#embeddings = sentence_model.encode(df['description_text'], show_progress_bar=True)
#reduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit_transform(embeddings)

#topic_model.visualize_documents(df['description_text'], reduced_embeddings=reduced_embeddings,
#                                hide_document_hover=True, hide_annotations=True)

In [None]:
#hierarchical_topics = topic_model.hierarchical_topics(df['description_text'])
#topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)

In [18]:
# Bar chart of the refined topics, limited to the top 5 (top_n_topics=5)
# NOTE(review): this repeats the earlier bar-chart cell and its execution
# count is lower than the cells above it; it presumably only shows something
# different when the outlier-reduction cell is re-enabled and run first.
# TODO confirm whether it is still needed.
refined_barchart = topic_model.visualize_barchart(top_n_topics=5)
refined_barchart

In [19]:
# Keywords and scores of topic 0, i.e. the most frequent topic.
# NOTE(review): the saved output of this cell appears to come from an earlier
# run than the cells above (older execution count) - re-run to refresh it.
most_frequent_topic_id = 0
topic_model.get_topic(most_frequent_topic_id)

[('rmf', 0.005815029761925889),
 ('system', 0.003802619577059121),
 ('clearance', 0.0036727186275768726),
 ('assessment', 0.003389592901501779),
 ('authorization', 0.003233354651688509),
 ('assurance', 0.0030899108012760134),
 ('secret', 0.0030761762156648414),
 ('government', 0.002945599485343047),
 ('emass', 0.0029316719461860066),
 ('systems', 0.0029246047722196796)]

In [None]:
# Finding similar topics between models using cosine similarity

from sklearn.metrics.pairwise import cosine_similarity

# Pairwise similarity between topic embeddings:
# one row per Indeed topic, one column per NICE framework topic.
sim_matrix = cosine_similarity(topic_model.topic_embeddings_, topic_model_nice.topic_embeddings_)

# The embeddings are assumed to be ordered by topic id, with the outlier topic
# -1 (when a model has one) in position 0 - the same assumption the hard-coded
# "+ 1" / "- 1" made before. Deriving the offsets per model keeps the lookup
# correct when one of the models produced no outlier topic.
indeed_offset = int(-1 in topic_model.get_topics())
nice_offset = int(-1 in topic_model_nice.get_topics())

topic = 0  # Select the most frequent topic
# Show the reference topic explicitly; a bare expression in the middle of a
# cell is not rendered (`display` is provided by the IPython kernel).
display(topic_model.get_topic(topic))

# Compare the NICE framework model with the Indeed model and look up the NICE
# topic that is most similar to the selected Indeed topic.
most_similar_topic = int(np.argmax(sim_matrix[topic + indeed_offset])) - nice_offset
topic_model_nice.get_topic(most_similar_topic)

