In [1]:

'''
Topic Modeling with LDA: Minimum Viable Example
References:
[1] LDA with Gensim: https://radimrehurek.com/gensim/models/ldamodel.html
[2] Visualization with pyLDAvis: https://pypi.org/project/pyLDAvis/
'''

# Import dependencies
import pandas as pd
import numpy as np
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from gensim.utils import tokenize as gensim_tokenize
import spacy
import pyLDAvis
import pyLDAvis.gensim_models

import warnings
import time
spacy.load("en_core_web_sm")
warnings.filterwarnings("ignore", category=DeprecationWarning)
pd.set_option('mode.chained_assignment', None)
import en_core_web_sm


In [2]:
def replaceKeyword(x, keyword_list):
    '''
    Input:
    - x: input string
    - keyword_list: list of keywords to replace with
    Output:
    - label: string of fixed word. If keywords do not exist in string, return None
    '''
    label=None
    for keyword in keyword_list:
        if label == None:
            try: 
                if keyword in x:
                    label=keyword
            except:
                label=None
    
    if label== None:
        return ""
    else:
        return label


def simplifyJobs(df):
    '''
    Input:
    - df: dataframe of job postings
    Output:
    - docs: list of strings representing input documents. The job postings are simplified.
    '''
    # shortened_dict = {'Machine Learning': 'ML', 'Business Intelligence': 'BI', 'Developer Operations': 'DevOps', 'Artificial Intelligence': 'AI'}
    job_titles=['DataAnalyst','DataEngineer','DataScientist','MachineLearningEngineer', 'BIAnalyst', 'AIEngineer','SoftwareEngineer','DevOpsEngineer','Architect','BigDataEngineer', 'ResearchScientist']
    df = df.dropna(thresh = 5)
    df['Job Title'] = df['Job Title'].str.replace(' ', '')
    # df['Job Title'] = df['Job Title'].replace(shortened_dict, regex = True)
    df['JobTitleClean'] = pd.Series( [replaceKeyword(x, job_titles ) for x in df['Job Title']])
    df['JobTitleClean'] = df['JobTitleClean'].fillna("") # replace with "other" for all jobs not in job_titles
    df['Requirment of the company '] = df['Requirment of the company '].fillna('Unknown Company')
    df['Experience level'] = df['Experience level'].fillna('Unknown Experience Level')
    df['posting'] = df['JobTitleClean'] + " " + df['Requirment of the company '] + " "  + df['Experience level']  # + " " + df['Company'] 
    df['posting'] = df['posting'].str.replace(",", " ")
    df['posting'] = df['posting'].str.replace("-", "")
    docs = df['posting'].to_list()
    return docs


def lemmatize(docs, allowed_postags = ["NOUN", "ADJ", "VERB", "ADV"]):
    '''
    Input:
    - docs: list of strings representing input documents
    - allowed_postags: list of accepted Part of Speech (POS) types
    Output:
    - list of strings with lemmatized input
    '''
    nlp = spacy.load("en_core_web_sm", disable = ["parser", "ner"])
    lemmatized_docs = []
    for doc in docs:
      # print(doc, "Type: ", type(doc))
      doc = nlp(doc)
      tokens = []
      for token in doc:
        if token.pos_ in allowed_postags:
          tokens.append(token.lemma_)
      lemmatized_docs.append(" ".join(tokens))
    return (lemmatized_docs)


def tokenize(docs):
    '''
    Input:
    - docs: list of strings representing input documents
    Output:
    - list of strings with tokenized input
    '''
    tokenized_docs = []
    for doc in docs:
      tokens = gensim.utils.simple_preprocess(doc, deacc=True)
      # tokens = [list(tokenize(s)) for s in docs]
      tokenized_docs.append(tokens)
    return (tokenized_docs)


In [3]:
build_corpus_time = time.time()

df = pd.read_csv('ai-jobs_data_science_job.csv', encoding = 'latin-1')

docs = simplifyJobs(df)
preprocess_start = time.time()
# lemmatized_docs = lemmatize(docs)
tokenized_docs = tokenize(docs)

end_preprocess = time.time()
print("Total time to preprocess: ", end_preprocess - preprocess_start)

type(docs[0])

Total time to preprocess:  0.04502296447753906


str

In [4]:
docs[16]

'MachineLearningEngineer AWS CUDA Docker Git GitHub GitLab Seniorlevel'

In [5]:
tokenized_docs[912]

['dataengineer',
 'engineering',
 'machine',
 'learning',
 'research',
 'seniorlevel']

In [6]:

# Mapping from word IDs to words
preprocess_start = time.time()
id2word = corpora.Dictionary(tokenized_docs)
end_preprocess = time.time()
print("Total time to map ids to words: ", end_preprocess - preprocess_start)


# Prepare Document-Term Matrix
corpus = []
for doc in tokenized_docs:
    corpus.append(id2word.doc2bow(doc))
# coherence_model_lda = CoherenceModel(model=topic_model, texts=corpus, dictionary=id2word, coherence='c_v')


# coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')

end_preprocess = time.time()
print("Total time to build corpus: ", end_preprocess - build_corpus_time)
corpus

Total time to map ids to words:  0.021539926528930664
Total time to build corpus:  0.20020556449890137


[[(0, 1),
  (1, 1),
  (2, 1),
  (3, 1),
  (4, 1),
  (5, 1),
  (6, 1),
  (7, 1),
  (8, 1),
  (9, 1)],
 [(1, 1), (2, 1), (3, 1), (10, 1), (11, 1), (12, 1), (13, 1)],
 [(0, 2),
  (8, 1),
  (10, 1),
  (14, 1),
  (15, 1),
  (16, 1),
  (17, 1),
  (18, 1),
  (19, 1),
  (20, 1),
  (21, 1)],
 [(2, 1), (3, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1)],
 [(0, 1),
  (1, 1),
  (8, 1),
  (15, 1),
  (27, 1),
  (28, 1),
  (29, 1),
  (30, 1),
  (31, 1),
  (32, 1)],
 [(0, 1),
  (8, 1),
  (16, 1),
  (18, 2),
  (24, 1),
  (33, 1),
  (34, 1),
  (35, 1),
  (36, 1),
  (37, 1)],
 [(1, 1), (3, 1), (6, 1), (13, 1), (38, 1), (39, 1)],
 [(3, 1), (4, 1), (39, 1), (40, 1), (41, 1)],
 [(0, 1),
  (1, 1),
  (3, 1),
  (5, 1),
  (8, 1),
  (18, 1),
  (23, 1),
  (29, 1),
  (34, 1),
  (42, 1),
  (43, 1)],
 [(0, 1),
  (1, 1),
  (8, 1),
  (10, 1),
  (15, 1),
  (27, 1),
  (29, 1),
  (37, 1),
  (42, 1),
  (44, 1)],
 [(0, 1),
  (1, 1),
  (3, 1),
  (8, 1),
  (16, 1),
  (18, 2),
  (33, 1),
  (34, 1),
  (45, 1),
  (46, 1),
  (4

In [7]:

start_model_fit = time.time()
topic_model = gensim.models.ldamodel.LdaModel(
    corpus = corpus,      # Document-Term Matrix
    id2word = id2word,    # Map word IDs to words
    num_topics = 6,      # Number of latent topics to extract
    random_state = 100,
    passes = 200,         # Number of passes in the corpus during training
    )
topic_model

end_preprocess = time.time()
print("Total time to fit lda model: ", end_preprocess - start_model_fit)


Total time to fit lda model:  76.88726758956909


In [8]:
topic_model.print_topics()

[(0,
  '0.241*"data" + 0.066*"management" + 0.065*"quality" + 0.057*"level" + 0.057*"experience" + 0.057*"unknown" + 0.043*"engineering" + 0.035*"etl" + 0.025*"governance" + 0.025*"agile"'),
 (1,
  '0.103*"excel" + 0.066*"finance" + 0.064*"entrylevel" + 0.057*"python" + 0.050*"seniorlevel" + 0.047*"dataanalyst" + 0.046*"power" + 0.046*"bi" + 0.045*"sql" + 0.042*"research"'),
 (2,
  '0.144*"learning" + 0.107*"machine" + 0.079*"engineering" + 0.067*"seniorlevel" + 0.042*"computer" + 0.036*"deep" + 0.035*"science" + 0.024*"entrylevel" + 0.022*"mathematics" + 0.021*"python"'),
 (3,
  '0.249*"data" + 0.072*"analysis" + 0.060*"computer" + 0.060*"science" + 0.060*"seniorlevel" + 0.055*"business" + 0.050*"analytics" + 0.047*"intelligence" + 0.035*"big" + 0.032*"visualization"'),
 (4,
  '0.146*"computer" + 0.113*"science" + 0.083*"seniorlevel" + 0.049*"engineering" + 0.042*"architecture" + 0.036*"vision" + 0.033*"aws" + 0.031*"classification" + 0.027*"deep" + 0.026*"learning"'),
 (5,
  '0.118*"

In [9]:
coherence_model_lda = CoherenceModel(model=topic_model, texts=tokenized_docs, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print("LDA Coherence C_V is: {0}".format(coherence_lda))

LDA Coherence C_V is: 0.3252468215595967


In [10]:
for key in id2word.keys():
    print(id2word[key])

computer
data
dataanalyst
entrylevel
genetics
mathematics
quality
sas
science
statistics
agile
finance
management
security
architecture
aws
deep
experience
learning
level
unknown
vision
bi
engineering
industrial
oracle
power
azure
consulting
dataengineer
dataflow
midlevel
pipelines
datascientist
machine
nlp
numpy
seniorlevel
banking
excel
business
intelligence
big
matlab
apis
analysis
keras
phd
analytics
economics
market
privacy
research
spark
bigquery
llms
pandas
python
cuda
docker
git
github
gitlab
redshift
bianalyst
looker
warehouse
metabase
visualization
ecommerce
airflow
clustering
governance
mining
blockchain
databricks
sql
cd
ci
devops
etl
crypto
fraud
risk
tableau
cybernetics
datarobot
angular
javascript
testing
warehousing
kpis
ml
models
classification
bigtable
distributed
systems
gcp
generative
modeling
dataops
hadoop
dagster
physics
causal
inference
cx
autonomous
driving
linux
architect
matplotlib
mongodb
jira
react
executivelevel
firm
mysql
gans
athena
cassandra
elt
snowfla

In [11]:
# get topics 
job_titles=['DataAnalyst','DataEngineer','DataScientist','MachineLearningEngineer', 'BIAnalyst', 
            'AIEngineer','SoftwareEngineer','DevOpsEngineer','Architect','BigDataEngineer', 'ResearchScientist']

word2id = {v: k for k, v in id2word.items()}

for title in job_titles:
    try:
        title = title.lower()
        print("Closest topics to ", title, word2id[title], ":")
        print(topic_model.get_term_topics(word2id[title]))
        topics = topic_model.get_term_topics(word2id[title], minimum_probability = 1e-8)
        print()
    except:
        print("Title not in list")
# 

Closest topics to  dataanalyst 2 :
[(0, 0.0133936005), (1, 0.046531532), (3, 0.012887663), (4, 0.012090275), (5, 0.01719847)]

Closest topics to  dataengineer 29 :
[(0, 0.017736), (1, 0.014463523), (2, 0.015453741), (3, 0.012817686), (4, 0.014611066), (5, 0.033198867)]

Closest topics to  datascientist 33 :
[(1, 0.013213822), (2, 0.018274916), (3, 0.0123016), (4, 0.013930283), (5, 0.012267215)]

Title not in list
Closest topics to  bianalyst 64 :
[]

Closest topics to  aiengineer 180 :
[]

Title not in list
Closest topics to  devopsengineer 207 :
[]

Closest topics to  architect 111 :
[]

Title not in list
Title not in list


In [12]:

start_vis = time.time()

pyLDAvis.enable_notebook()
visualization = pyLDAvis.gensim_models.prepare(
    topic_model, 
    corpus,
    id2word, 
    mds = "mmds", 
    R = 15)

end_preprocess = time.time()
print("Total time to visualize model: ", end_preprocess - start_vis)

visualization

Total time to visualize model:  1.8049423694610596


In [13]:
topic_model 

<gensim.models.ldamodel.LdaModel at 0x1d724530130>

In [14]:
visualization