<a href="https://colab.research.google.com/github/indranildchandra/JD_Keywords_Extractor/blob/master/src/Topic_Modeling_on_Job_Descriptions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [44]:
from google.colab import files

uploaded = files.upload()

for fn in uploaded.keys():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(uploaded[fn])))

#upload online-job-posts.csv file for training

In [45]:
!pip install pyLDAvis
!pip install paramiko



In [0]:
import spacy
import nltk
import random
import pickle
import gensim
import pandas as pd
import pyLDAvis.gensim
from spacy.lang.en import English
from spacy import displacy
from nltk.corpus import wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer
from gensim import corpora

In [47]:
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [0]:
parser = spacy.load('en')

In [0]:
def tokenize(text):
    lda_tokens = []
    tokens = parser(text)
    for token in tokens:
        if token.orth_.isspace():
            continue
        elif token.is_punct:
            continue
        elif token.like_url:
            lda_tokens.append('URL')
        elif token.orth_.startswith('@'):
            lda_tokens.append('at')
        elif token.pos_ == "ADJ" or token.pos_ == "VERB" or token.pos_ == "RBR" or token.pos_ == "RBS" or token.pos_ == "RB" or token.pos_ == "RP":
            continue
        else:
            lda_tokens.append(token.lower_)
    return lda_tokens

In [0]:
def get_lemma(word):
    lemma = wn.morphy(word)
    if lemma is None:
        return word
    else:
        return lemma
    
def get_lemma2(word):
    return WordNetLemmatizer().lemmatize(word)

In [68]:
for w in ['ran', 'happier', 'charging']:
    print(w, get_lemma(w), get_lemma2(w))

ran run ran
happier happy happier
charging charge charging


In [0]:
en_stop = set(nltk.corpus.stopwords.words('english'))

In [70]:
#[["jobpost","date","Title","Company","AnnouncementCode","Term","Eligibility","Audience","StartDate","Duration","Location","JobDescription","JobRequirement","RequiredQual","Salary","ApplicationP","OpeningDate","Deadline","Notes","AboutC","Attach","Year","Month","IT"]]
raw_data = pd.read_csv("online-job-posts.csv")
raw_data.fillna("", inplace = True)

print(raw_data.head(2))

                                             jobpost         date  \
0  AMERIA Investment Consulting Company\r\nJOB TI...  Jan 5, 2004   
1  International Research & Exchanges Board (IREX...  Jan 7, 2004   

                                               Title  \
0                            Chief Financial Officer   
1  Full-time Community Connections Intern (paid i...   

                                           Company AnnouncementCode Term  \
0             AMERIA Investment Consulting Company                         
1  International Research & Exchanges Board (IREX)                         

  Eligibility Audience StartDate  Duration  ...   Salary  \
0                                           ...            
1                                 3 months  ...            

                                        ApplicationP OpeningDate   Deadline  \
0  To apply for this position, please submit a\r\...              26-Jan-04   
1  Please submit a cover letter and resume to:\r\...   

In [0]:
def prepare_text_for_lda(text):
    tokens = tokenize(text)
    tokens = [token for token in tokens if len(token) > 3]
    tokens = [token for token in tokens if token not in en_stop]
    tokens = [get_lemma(token) for token in tokens]
    return tokens

In [0]:
job_description_data = []
job_requirement_data = []
required_qualification_data = []

for index, entry in raw_data.iterrows():
  job_description_tokens = prepare_text_for_lda(str(entry["JobDescription"]).strip())
  job_requirement_tokens = prepare_text_for_lda(str(entry["JobRequirement"]).strip())
  required_qualification_tokens = prepare_text_for_lda(str(entry["RequiredQual"]).strip())
  
  if not len(job_description_tokens) == 0:
    job_description_data.append(job_description_tokens)
  if not len(job_requirement_tokens) == 0:
    job_requirement_data.append(job_requirement_tokens)
  if not len(required_qualification_tokens) == 0:
    required_qualification_data.append(required_qualification_tokens)

In [0]:
job_description_dictionary = corpora.Dictionary(job_description_data)
job_requirement_dictionary = corpora.Dictionary(job_requirement_data)
required_qualification_dictionary = corpora.Dictionary(required_qualification_data)

In [0]:
job_description_corpus = [job_description_dictionary.doc2bow(text) for text in job_description_data]
job_requirement_corpus = [job_requirement_dictionary.doc2bow(text) for text in job_requirement_data]
required_qualification_corpus = [required_qualification_dictionary.doc2bow(text) for text in required_qualification_data]

In [0]:
pickle.dump(job_description_corpus, open('job_description_corpus.pkl', 'wb'))
job_description_dictionary.save('job_description_dictionary.gensim')

pickle.dump(job_requirement_corpus, open('job_requirement_corpus.pkl', 'wb'))
job_requirement_dictionary.save('job_requirement_dictionary.gensim')

pickle.dump(required_qualification_corpus, open('required_qualification_corpus.pkl', 'wb'))
required_qualification_dictionary.save('required_qualification_dictionary.gensim')

In [0]:
JOB_DESCRIPTION_NUM_TOPICS = 20
job_description_ldamodel = gensim.models.ldamodel.LdaModel(job_description_corpus, num_topics = JOB_DESCRIPTION_NUM_TOPICS, id2word=job_description_dictionary, passes=50)
job_description_ldamodel.save('job_description_model.gensim')

JOB_REQUIREMENT_NUM_TOPICS = 20
job_requirement_ldamodel = gensim.models.ldamodel.LdaModel(job_requirement_corpus, num_topics = JOB_REQUIREMENT_NUM_TOPICS, id2word=job_requirement_dictionary, passes=50)
job_requirement_ldamodel.save('job_requirement_model.gensim')

REQUIRED_QUALIFICATION_NUM_TOPICS = 20
required_qualification_ldamodel = gensim.models.ldamodel.LdaModel(required_qualification_corpus, num_topics = REQUIRED_QUALIFICATION_NUM_TOPICS, id2word=required_qualification_dictionary, passes=50)
required_qualification_ldamodel.save('required_qualification_model.gensim')

In [94]:
job_description_topics = job_description_ldamodel.print_topics(num_words=10)
print("Job Description Topics: ")
for topic in job_description_topics:
    print(topic)

job_requirement_topics = job_requirement_ldamodel.print_topics(num_words=10)
print("\nJob Requirement Topics: ")
for topic in job_requirement_topics:
    print(topic)

required_qualification_topics = required_qualification_ldamodel.print_topics(num_words=10)
print("\nRequired Qualification Topics: ")
for topic in required_qualification_topics:
    print(topic)

Job Description Topics: 
(0, '0.101*"sales" + 0.077*"marketing" + 0.057*"company" + 0.050*"product" + 0.043*"manager" + 0.034*"incumbent" + 0.023*"strategy" + 0.021*"market" + 0.018*"customer" + 0.016*"business"')
(1, '0.044*"accounting" + 0.039*"company" + 0.031*"finance" + 0.030*"accountant" + 0.029*"incumbent" + 0.029*"chief" + 0.022*"management" + 0.019*"audit" + 0.019*"procedure" + 0.018*"activity"')
(2, '0.054*"language" + 0.041*"data" + 0.028*"english" + 0.024*"analyst" + 0.021*"activity" + 0.021*"chain" + 0.020*"lawyer" + 0.019*"translation" + 0.017*"fund" + 0.016*"information"')
(3, '0.061*"training" + 0.046*"community" + 0.032*"development" + 0.023*"program" + 0.021*"capacity" + 0.018*"society" + 0.018*"implementation" + 0.018*"government" + 0.017*"evaluation" + 0.017*"analysis"')
(4, '0.088*"system" + 0.051*"incumbent" + 0.040*"network" + 0.034*"database" + 0.032*"support" + 0.026*"maintenance" + 0.025*"company" + 0.024*"administrator" + 0.018*"server" + 0.016*"security"')
(

In [99]:
new_doc = "Investment Consulting Company is seeking a Chief Financial Officer. This position manages the company's fiscal and administrative functions, provides highly responsible and technically complex staff assistance to the Executive Director. The work performed requires a high level of technical proficiency in financial management and investment management, as well as management, supervisory and administrative skills."
document = parser(new_doc)

print("Noun Phrases...")
print("--- [Format: Noun Phrase -> Root Text] ---")
for noun_phrase in document.noun_chunks:
  #print(noun_phrase.text, noun_phrase.label_, noun_phrase.root.text)
  print(noun_phrase.text + " -> " + noun_phrase.root.text)

new_doc = prepare_text_for_lda(new_doc)
new_doc_bow = job_description_dictionary.doc2bow(new_doc)
print("\nTopics relevant to new document are: ")
print(job_description_ldamodel.get_document_topics(new_doc_bow, minimum_probability=0.1))

Noun Phrases...
--- [Format: Noun Phrase -> Root Text] ---
Investment Consulting Company -> Company
a Chief Financial Officer -> Officer
This position -> position
the company's fiscal and administrative functions -> functions
highly responsible and technically complex staff assistance -> assistance
the Executive Director -> Director
The work -> work
a high level -> level
technical proficiency -> proficiency
financial management and investment management -> management
management -> management
skills -> skills

Topics relevant to new document are: 
[(1, 0.19130743), (2, 0.11778699), (12, 0.33761704), (14, 0.23294441)]


In [100]:
job_description_dictionary = gensim.corpora.Dictionary.load('job_description_dictionary.gensim')
job_description_corpus = pickle.load(open('job_description_corpus.pkl', 'rb'))
job_description_ldamodel = gensim.models.ldamodel.LdaModel.load('job_description_model.gensim')

job_description_lda_display = pyLDAvis.gensim.prepare(job_description_ldamodel, job_description_corpus, job_description_dictionary, sort_topics=True, mds='mmds')
pyLDAvis.display(job_description_lda_display)

  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [101]:
job_requirement_dictionary = gensim.corpora.Dictionary.load('job_requirement_dictionary.gensim')
job_requirement_corpus = pickle.load(open('job_requirement_corpus.pkl', 'rb'))
job_requirement_ldamodel = gensim.models.ldamodel.LdaModel.load('job_requirement_model.gensim')

job_requirement_lda_display = pyLDAvis.gensim.prepare(job_requirement_ldamodel, job_requirement_corpus, job_requirement_dictionary, sort_topics=True, mds='mmds')
pyLDAvis.display(job_requirement_lda_display)

  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [102]:
required_qualification_dictionary = gensim.corpora.Dictionary.load('required_qualification_dictionary.gensim')
required_qualification_corpus = pickle.load(open('required_qualification_corpus.pkl', 'rb'))
required_qualification_ldamodel = gensim.models.ldamodel.LdaModel.load('required_qualification_model.gensim')

required_qualification_lda_display = pyLDAvis.gensim.prepare(required_qualification_ldamodel, required_qualification_corpus, required_qualification_dictionary, sort_topics=True, mds='mmds')
pyLDAvis.display(required_qualification_lda_display)

  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [103]:
!ls -ltr

total 113064
drwxr-xr-x 1 root root     4096 Apr  4 20:20 sample_data
-rw-r--r-- 1 root root 95435519 Apr 14 15:37 online-job-posts.csv
-rw-r--r-- 1 root root  2316149 Apr 14 19:07 job_description_corpus.pkl
-rw-r--r-- 1 root root   189635 Apr 14 19:07 job_description_dictionary.gensim
-rw-r--r-- 1 root root  5715308 Apr 14 19:07 job_requirement_corpus.pkl
-rw-r--r-- 1 root root   251569 Apr 14 19:07 job_requirement_dictionary.gensim
-rw-r--r-- 1 root root  5714777 Apr 14 19:07 required_qualification_corpus.pkl
-rw-r--r-- 1 root root   199129 Apr 14 19:07 required_qualification_dictionary.gensim
-rw-r--r-- 1 root root   864084 Apr 14 19:11 job_description_model.gensim.state
-rw-r--r-- 1 root root   248274 Apr 14 19:11 job_description_model.gensim.id2word
-rw-r--r-- 1 root root   596448 Apr 14 19:11 job_description_model.gensim.expElogbeta.npy
-rw-r--r-- 1 root root    49681 Apr 14 19:11 job_description_model.gensim
-rw-r--r-- 1 root root  1140285 Apr 14 19:16 job_requirement_model.gens

In [0]:
files.download("job_description_corpus.pkl")
files.download("job_description_dictionary.gensim")
files.download("job_description_model.gensim")
files.download("job_description_model.gensim.state")
files.download("job_description_model.gensim.id2word")
files.download("job_description_model.gensim.expElogbeta.npy")

files.download("job_requirement_corpus.pkl")
files.download("job_requirement_dictionary.gensim")
files.download("job_requirement_model.gensim")
files.download("job_requirement_model.gensim.state")
files.download("job_requirement_model.gensim.id2word")
files.download("job_requirement_model.gensim.expElogbeta.npy")

files.download("required_qualification_corpus.pkl")
files.download("required_qualification_dictionary.gensim")
files.download("required_qualification_model.gensim")
files.download("required_qualification_model.gensim.state")
files.download("required_qualification_model.gensim.id2word")
files.download("required_qualification_model.gensim.expElogbeta.npy")