In [3]:
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from hdbscan import HDBSCAN
from bertopic.vectorizers import ClassTfidfTransformer
from gensim.models.coherencemodel import CoherenceModel
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.vectorizers import ClassTfidfTransformer
import pandas as pd
import numpy as np
import time
import gensim.corpora as corpora
script_start_time = time.time()


folder_path = "YOUR_FILE_PATH"

In [4]:
def replaceKeyword(x, keyword_list):
    '''
    Input:
    - x: input string
    - keyword_list: list of keywords to replace with
    Output:
    - label: string of fixed word. If keywords do not exist in string, return None
    '''
    label=None
    for keyword in keyword_list:
        if label == None:
            try: 
                if keyword in x:
                    label=keyword
            except:
                label=None
    
    if label== None:
        return "other"
    else:
        return label


def simplifyJobs(df):
    '''
    Input:
    - df: dataframe of job postings
    Output:
    - docs: list of strings representing input documents. The job postings are simplified.
    '''
    shortened_dict = {'Machine Learning': 'ML', 'Business Intelligence': 'BI', 'Developer Operations': 'DevOps', 'Artificial Intelligence': 'AI'}
    job_titles=['DataAnalyst','DataEngineer','DataScientist','MachineLearningEngineer', 'BIAnalyst',
           'AIEngineer','SoftwareEngineer','DevOpsEngineer','Architect','BigDataEngineer', 'ResearchScientist']
    df = df.dropna(thresh = 5)
    df['Job Title'] = df['Job Title'].str.replace(' ', '') # shorten job title
    df['Job Title'] = df['Job Title'].replace(shortened_dict, regex = True)
    df['JobTitleClean'] = pd.Series( [replaceKeyword(x, job_titles ) for x in df['Job Title']])
    df['JobTitleClean'] = df['JobTitleClean'].fillna("other") # other for all jobs not in title
    df['Requirment of the company '] = df['Requirment of the company '].fillna('Unknown Company')
    df['Experience level'] = df['Experience level'].fillna('Unknown Experience Level')
    df['posting'] = df['JobTitleClean'] + " " + df['Requirment of the company '] + " " + df['Experience level'] # not including company
    df['posting'] = df['posting'].str.replace(",", " ")
    df['posting'] = df['posting'].str.replace("-", "")
    docs = df['posting'].to_list()
    return docs


In [5]:
# Embedding model: See [1] for more details 
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Clustering model: See [2] for more details
cluster_model = HDBSCAN(min_cluster_size = 15, 
                        metric = 'euclidean', 
                        cluster_selection_method = 'eom', 
                        prediction_data = True)



modules.json: 100%|█████████████████████████████████████████████████████████████████████████████████| 349/349 [00:00<00:00, 120kB/s]
config_sentence_transformers.json: 100%|███████████████████████████████████████████████████████████| 116/116 [00:00<00:00, 29.0kB/s]
README.md: 100%|███████████████████████████████████████████████████████████████████████████████| 10.7k/10.7k [00:00<00:00, 2.13MB/s]
sentence_bert_config.json: 100%|█████████████████████████████████████████████████████████████████| 53.0/53.0 [00:00<00:00, 13.2kB/s]
config.json: 100%|██████████████████████████████████████████████████████████████████████████████████| 612/612 [00:00<00:00, 150kB/s]
pytorch_model.bin: 100%|███████████████████████████████████████████████████████████████████████| 90.9M/90.9M [00:22<00:00, 4.00MB/s]
tokenizer_config.json: 100%|███████████████████████████████████████████████████████████████████████| 350/350 [00:00<00:00, 70.2kB/s]
vocab.txt: 100%|█████████████████████████████████████████████████████

In [6]:
df = pd.read_csv('ai-jobs_data_science_job.csv', encoding = 'latin-1')

docs = simplifyJobs(df)
docs

['DataAnalyst Computer Science Data quality Genetics Mathematics SAS Statistics Entrylevel',
 'DataAnalyst Agile Data management Finance Security   Entrylevel',
 'MachineLearningEngineer Agile Architecture AWS Computer Science Computer Vision Deep Learning Unknown Experience Level',
 'DataAnalyst Engineering Industrial Oracle Power BI R R&D Entrylevel',
 'DataEngineer AWS Azure Computer Science Consulting Dataflow Data pipelines Midlevel',
 'DataScientist Computer Science Deep Learning Industrial Machine Learning NLP NumPy Seniorlevel',
 'other Banking Data quality Excel Security   Entrylevel',
 'other Business Intelligence Excel Genetics    Entrylevel',
 'DataEngineer Big Data Computer Science Engineering Machine Learning Mathematics Matlab Entrylevel',
 'DataEngineer Agile APIs AWS Azure Big Data Computer Science Seniorlevel',
 'DataScientist Computer Science Data analysis Deep Learning Keras Machine Learning PhD Entrylevel',
 'DataAnalyst Business Analytics Business Intelligence Dat

In [7]:
# BERTopic model
ctfidf_model = ClassTfidfTransformer()
load_model_time = time.time()
topic_model = BERTopic(embedding_model = embedding_model,
                       calculate_probabilities=True,
                       hdbscan_model = cluster_model,
                       nr_topics = 10,
                       ctfidf_model = ctfidf_model  # extract topic words wihto TF-IDF, but instead of term-doc calculate as term-topic)
                      )
end_load_model = time.time()
print("Total time to load model: ", end_load_model - load_model_time)

Total time to load model:  0.0


In [8]:
# Fit the model on a corpus
load_model_time = time.time()
topics, probs = topic_model.fit_transform(docs)
end_load_model = time.time()
print("Total time to load model: ", end_load_model - load_model_time)


Total time to load model:  20.830671072006226


In [10]:
# Visualization examples: See [3] for more details
load_model_time = time.time()
# Save intertopic distance map as HTML file
# topic_model.visualize_topics().write_html(folder_path +"intertopic_dist_map.html")
topic_model.visualize_topics().write_html("intertopic_dist_map.html")
print("Total time to load model: ", end_load_model - load_model_time)

Total time to load model:  -182.11826848983765


In [12]:
load_model_time = time.time()
# Save topic-terms barcharts as HTML file
topic_model.visualize_barchart(top_n_topics = 25).write_html("barchart.html")
end_load_model = time.time()
print("Total time to load model: ", end_load_model - load_model_time)


Total time to load model:  0.0658717155456543


In [14]:
load_model_time = time.time()
# Save documents projection as HTML file
topic_model.visualize_documents(docs).write_html("projections.html")

# Save topics dendrogram as HTML file
topic_model.visualize_hierarchy().write_html("hieararchy.html")
end_load_model = time.time()
print("Total time to load model: ", end_load_model - load_model_time)


Total time to load model:  13.11623740196228


In [15]:
# Evaluate model 
# Reference: https://github.com/MaartenGr/BERTopic/issues/90
documents = pd.DataFrame({"Document": docs,
                          "ID": range(len(docs)),
                          "Topic": topics})
documents_per_topic = documents.groupby(['Topic'], as_index=False).agg({'Document': ' '.join})
cleaned_docs = topic_model._preprocess_text(documents_per_topic.Document.values)

# Extract vectorizer and analyzer from BERTopic
vectorizer = topic_model.vectorizer_model
analyzer = vectorizer.build_analyzer()

# Extract features for Topic Coherence evaluation
words = vectorizer.get_feature_names_out() # earlier version of this function is get_feature_names()
tokens = [analyzer(doc) for doc in cleaned_docs]
dictionary = corpora.Dictionary(tokens)
corpus = [dictionary.doc2bow(token) for token in tokens]
topic_words = [[words for words, _ in topic_model.get_topic(topic)] 
               for topic in range(len(set(topics))-1)]

# Evaluate
coherence_model_bertopic = CoherenceModel(topics=topic_words, 
                                 texts=tokens, 
                                 corpus=corpus,
                                 dictionary=dictionary, 
                                 coherence='c_v')
coherence_bertopic = coherence_model_bertopic.get_coherence()
print("Bertopic Coherence C_V is: {0}".format(coherence_bertopic))

Bertopic Coherence C_V is: 0.4306240670898964


In [16]:
df = topic_model.get_topic_info()
df.to_csv('output.csv')
df

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,955,-1_data_other_engineering_seniorlevel,"[data, other, engineering, seniorlevel, learni...",[other AWS Computer Science Data Mining Excel ...
1,0,1301,0_data_aws_seniorlevel_azure,"[data, aws, seniorlevel, azure, architecture, ...",[DataEngineer Agile AWS Big Data Computer Scie...
2,1,375,1_learning_machine_deep_computer,"[learning, machine, deep, computer, science, e...",[DataAnalyst Computer Science Computer Vision ...
3,2,241,2_entrylevel_business_intelligence_excel,"[entrylevel, business, intelligence, excel, py...",[DataEngineer Business Intelligence Data analy...
4,3,95,3_docker_ansible_aws_ci,"[docker, ansible, aws, ci, cd, azure, computer...",[DataScientist AWS Azure CI/CD Computer Scienc...
5,4,79,4_blockchain_crypto_banking_seniorlevel,"[blockchain, crypto, banking, seniorlevel, sci...",[other Banking Blockchain Computer Science Cry...
6,5,58,5_chatbots_robotics_seniorlevel_engineering,"[chatbots, robotics, seniorlevel, engineering,...",[DataAnalyst Agile APIs Architecture Chatbots ...
7,6,41,6_privacy_machine_research_learning,"[privacy, machine, research, learning, enginee...",[other Big Data Engineering Genetics Machine L...
8,7,36,7_causal_inference_economics_machine,"[causal, inference, economics, machine, engine...",[DataScientist Causal inference Computer Scien...
9,8,16,8_angular_testing_javascript_apis,"[angular, testing, javascript, apis, react, bi...",[DataAnalyst A/B testing Angular APIs Computer...


In [17]:
end_time = time.time()
print("Total script time: ",  end_time-script_start_time )

Total script time:  338.68249773979187
