<a href="https://colab.research.google.com/github/indranildchandra/JD_Keywords_Extractor/blob/master/src/Topic_Modeling_on_Job_Descriptions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
from google.colab import files

uploaded = files.upload()

for fn in uploaded.keys():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(uploaded[fn])))

#upload online-job-posts.zip file for training

Saving online-job-posts.zip to online-job-posts.zip
User uploaded file "online-job-posts.zip" with length 13254376 bytes


In [0]:
!unzip online-job-posts.zip

Archive:  online-job-posts.zip
  inflating: online-job-posts.csv    


In [0]:
!ls -ltr

total 106148
-rw-r--r-- 1 root root 95435519 Apr  3 03:12 online-job-posts.csv
drwxr-xr-x 1 root root     4096 Apr  4 20:20 sample_data
-rw-r--r-- 1 root root 13254376 Apr 23 11:50 online-job-posts.zip


In [0]:
!pip install pyLDAvis
!pip install paramiko



In [0]:
import spacy
import nltk
import random
import pickle
import gensim
import pandas as pd
import pyLDAvis.gensim

from spacy.lang.en import English
from spacy import displacy
from nltk.corpus import wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer
from gensim import corpora, models

In [0]:
import warnings
warnings.filterwarnings('ignore', category=DeprecationWarning)
warnings.filterwarnings('ignore', category=PendingDeprecationWarning)
warnings.filterwarnings('ignore', category=ResourceWarning)
warnings.filterwarnings('ignore', category=FutureWarning)

In [0]:
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [0]:
parser = spacy.load('en')

In [0]:
def tokenize(text):
    lda_tokens = []
    tokens = parser(text)
    for token in tokens:
        if token.orth_.isspace():
            continue
        elif token.is_punct:
            continue
        elif token.like_url:
            lda_tokens.append('URL')
        elif token.orth_.startswith('@'):
            lda_tokens.append('at')
        elif token.pos_ == "ADJ" or token.pos_ == "VERB" or token.pos_ == "RBR" or token.pos_ == "RBS" or token.pos_ == "RB" or token.pos_ == "RP":
            continue
        else:
            lda_tokens.append(token.lower_)
    return lda_tokens

In [0]:
def get_lemma(word):
    lemma = wn.morphy(word)
    if lemma is None:
        return word
    else:
        return lemma
    
def get_lemma2(word):
    return WordNetLemmatizer().lemmatize(word)

In [0]:
for w in ['ran', 'happier', 'charging']:
    print(w, get_lemma(w), get_lemma2(w))

ran run ran
happier happy happier
charging charge charging


In [0]:
en_stop = set(nltk.corpus.stopwords.words('english'))

In [0]:
#[["jobpost","date","Title","Company","AnnouncementCode","Term","Eligibility","Audience","StartDate","Duration","Location","JobDescription","JobRequirement","RequiredQual","Salary","ApplicationP","OpeningDate","Deadline","Notes","AboutC","Attach","Year","Month","IT"]]
raw_data = pd.read_csv("online-job-posts.csv")
raw_data.fillna("", inplace = True)

print(raw_data.head(2))

                                             jobpost         date  \
0  AMERIA Investment Consulting Company\r\nJOB TI...  Jan 5, 2004   
1  International Research & Exchanges Board (IREX...  Jan 7, 2004   

                                               Title  \
0                            Chief Financial Officer   
1  Full-time Community Connections Intern (paid i...   

                                           Company AnnouncementCode Term  \
0             AMERIA Investment Consulting Company                         
1  International Research & Exchanges Board (IREX)                         

  Eligibility Audience StartDate  Duration  ... Salary  \
0                                           ...          
1                                 3 months  ...          

                                        ApplicationP OpeningDate   Deadline  \
0  To apply for this position, please submit a\r\...              26-Jan-04   
1  Please submit a cover letter and resume to:\r\...         

In [0]:
def prepare_text_for_lda(text):
    tokens = tokenize(text)
    tokens = [token for token in tokens if len(token) > 3]
    tokens = [token for token in tokens if token not in en_stop]
    tokens = [get_lemma(token) for token in tokens]
    return tokens

In [0]:
job_description_data = []
job_requirement_data = []
required_qualification_data = []

for index, entry in raw_data.iterrows():
  job_description_tokens = prepare_text_for_lda(str(entry["JobDescription"]).strip())
  job_requirement_tokens = prepare_text_for_lda(str(entry["JobRequirement"]).strip())
  required_qualification_tokens = prepare_text_for_lda(str(entry["RequiredQual"]).strip())
  
  if not len(job_description_tokens) == 0:
    job_description_data.append(job_description_tokens)
  if not len(job_requirement_tokens) == 0:
    job_requirement_data.append(job_requirement_tokens)
  if not len(required_qualification_tokens) == 0:
    required_qualification_data.append(required_qualification_tokens)

In [0]:
job_description_dictionary = corpora.Dictionary(job_description_data)
job_requirement_dictionary = corpora.Dictionary(job_requirement_data)
required_qualification_dictionary = corpora.Dictionary(required_qualification_data)

**Filter out tokens that appear in -**

*   less than 15 documents (absolute number)
*   more than 0.5 documents (fraction of total corpus size, not absolute number)
*   after the above two steps, keep only the first 100000 most frequent tokens




In [0]:
# job_description_dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)
# job_requirement_dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)
# required_qualification_dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

**Print 10 example words from each of the three dictionaries formed**

In [0]:
count = 0
print("{} words present in job_description_dictionary.".format(len(job_description_dictionary)))
print("Examples from job_description_dictionary...")
for k, v in job_description_dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break
        
count = 0
print("\n{} words present in job_requirement_dictionary.".format(len(job_requirement_dictionary)))
print("Examples from job_requirement_dictionary...")
for k, v in job_requirement_dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break
        
count = 0
print("\n{} words present in required_qualification_dictionary.".format(len(required_qualification_dictionary)))
print("Examples from required_qualification_dictionary...")
for k, v in required_qualification_dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break

7454 words present in job_description_dictionary.
Examples from job_description_dictionary...
0 ameria
1 assistance
2 chief
3 company
4 consult
5 director
6 executive
7 financial
8 function
9 highly
10 investment

9821 words present in job_requirement_dictionary.
Examples from job_requirement_dictionary...
0 accounting
1 action
2 activity
3 adequacy
4 administration
5 article
6 assigning
7 assist
8 board
9 budget
10 cash

7758 words present in required_qualification_dictionary.
Examples from required_qualification_dictionary...
0 ability
1 acca
2 accounting
3 accounting/
4 activity
5 administration
6 analysis
7 analytical
8 and/or
9 application
10 audience


In [0]:
job_description_bow_corpus = [job_description_dictionary.doc2bow(text) for text in job_description_data]
job_requirement_bow_corpus = [job_requirement_dictionary.doc2bow(text) for text in job_requirement_data]
required_qualification_bow_corpus = [required_qualification_dictionary.doc2bow(text) for text in required_qualification_data]

**Preview Bag of Words on a sample pre-processed document**

In [0]:
print("Examples from job_description_bow_corpus...")
for i in range(len(job_description_bow_corpus[10])):
  print("Word {} (\"{}\") appears {} time.".format(job_description_bow_corpus[10][i][0], job_description_dictionary[job_description_bow_corpus[10][i][0]], job_description_bow_corpus[10][i][1]))

print("\nExamples from job_requirement_bow_corpus...")
for i in range(len(job_requirement_bow_corpus[10])):
  print("Word {} (\"{}\") appears {} time.".format(job_requirement_bow_corpus[10][i][0], job_requirement_dictionary[job_requirement_bow_corpus[10][i][0]], job_requirement_bow_corpus[10][i][1]))

print("\nExamples from required_qualification_bow_corpus...")
for i in range(len(required_qualification_bow_corpus[10])):
  print("Word {} (\"{}\") appears {} time.".format(required_qualification_bow_corpus[10][i][0], required_qualification_dictionary[required_qualification_bow_corpus[10][i][0]], required_qualification_bow_corpus[10][i][1]))

Examples from job_description_bow_corpus...
Word 14 ("position") appears 1 time.
Word 149 ("communication") appears 1 time.
Word 150 ("concept") appears 1 time.
Word 151 ("design") appears 1 time.
Word 152 ("designer") appears 1 time.
Word 153 ("experience") appears 1 time.
Word 154 ("field") appears 1 time.
Word 155 ("graphic") appears 1 time.
Word 156 ("medium") appears 1 time.
Word 157 ("since") appears 1 time.
Word 158 ("study") appears 1 time.

Examples from job_requirement_bow_corpus...
Word 11 ("client") appears 1 time.
Word 25 ("development") appears 1 time.
Word 28 ("documentation") appears 1 time.
Word 64 ("product") appears 1 time.
Word 82 ("team") appears 1 time.
Word 111 ("communication") appears 1 time.
Word 123 ("group") appears 1 time.
Word 137 ("project") appears 1 time.
Word 164 ("software") appears 1 time.
Word 266 ("also") appears 1 time.
Word 267 ("designer") appears 1 time.
Word 268 ("every") appears 1 time.
Word 269 ("graphic") appears 1 time.
Word 270 ("guide") 

In [0]:
job_description_tfidf_model = models.TfidfModel(job_description_bow_corpus)
job_description_tfidf_corpus = job_description_tfidf_model[job_description_bow_corpus]

job_requirement_tfidf_model = models.TfidfModel(job_requirement_bow_corpus)
job_requirement_tfidf_corpus = job_requirement_tfidf_model[job_requirement_bow_corpus]

required_qualification_tfidf_model = models.TfidfModel(required_qualification_bow_corpus)
required_qualification_tfidf_corpus = required_qualification_tfidf_model[required_qualification_bow_corpus]

In [0]:
pickle.dump(job_description_bow_corpus, open('job_description_bow_corpus.pkl', 'wb'))
pickle.dump(job_description_tfidf_corpus, open('job_description_tfidf_corpus.pkl', 'wb'))
job_description_dictionary.save('job_description_dictionary.gensim')

pickle.dump(job_requirement_bow_corpus, open('job_requirement_bow_corpus.pkl', 'wb'))
pickle.dump(job_requirement_tfidf_corpus, open('job_requirement_tfidf_corpus.pkl', 'wb'))
job_requirement_dictionary.save('job_requirement_dictionary.gensim')

pickle.dump(required_qualification_bow_corpus, open('required_qualification_bow_corpus.pkl', 'wb'))
pickle.dump(required_qualification_tfidf_corpus, open('required_qualification_tfidf_corpus.pkl', 'wb'))
required_qualification_dictionary.save('required_qualification_dictionary.gensim')

In [0]:
!grep -c ^processor /proc/cpuinfo
# Set this value to NUM_OF_CORES

2


In [0]:
NUM_OF_CORES = 2

In [0]:
JOB_DESCRIPTION_NUM_TOPICS = 20
job_description_bow_lda_model = gensim.models.LdaMulticore(job_description_bow_corpus, num_topics = JOB_DESCRIPTION_NUM_TOPICS, id2word=job_description_dictionary, passes=100, workers=NUM_OF_CORES)
job_description_bow_lda_model.save('job_description_bow_lda_model.gensim')
job_description_tfidf_lda_model = gensim.models.LdaMulticore(job_description_tfidf_corpus, num_topics = JOB_DESCRIPTION_NUM_TOPICS, id2word=job_description_dictionary, passes=100, workers=NUM_OF_CORES)
job_description_tfidf_lda_model.save('job_description_tfidf_lda_model.gensim')

In [0]:
JOB_REQUIREMENT_NUM_TOPICS = 20
job_requirement_bow_lda_model = gensim.models.LdaMulticore(job_requirement_bow_corpus, num_topics = JOB_REQUIREMENT_NUM_TOPICS, id2word=job_requirement_dictionary, passes=100, workers=NUM_OF_CORES)
job_requirement_bow_lda_model.save('job_requirement_bow_lda_model.gensim')
job_requirement_tfidf_lda_model = gensim.models.LdaMulticore(job_requirement_tfidf_corpus, num_topics = JOB_REQUIREMENT_NUM_TOPICS, id2word=job_requirement_dictionary, passes=100, workers=NUM_OF_CORES)
job_requirement_tfidf_lda_model.save('job_requirement_tfidf_lda_model.gensim')

In [0]:
REQUIRED_QUALIFICATION_NUM_TOPICS = 20
required_qualification_bow_lda_model = gensim.models.LdaMulticore(required_qualification_bow_corpus, num_topics = REQUIRED_QUALIFICATION_NUM_TOPICS, id2word=required_qualification_dictionary, passes=100, workers=NUM_OF_CORES)
required_qualification_bow_lda_model.save('required_qualification_bow_lda_model.gensim')
required_qualification_tfidf_lda_model = gensim.models.LdaMulticore(required_qualification_tfidf_corpus, num_topics = REQUIRED_QUALIFICATION_NUM_TOPICS, id2word=required_qualification_dictionary, passes=100, workers=NUM_OF_CORES)
required_qualification_tfidf_lda_model.save('required_qualification_tfidf_lda_model.gensim')

In [0]:
job_description_bow_topics = job_description_bow_lda_model.print_topics(num_words=10)
print("Job Description Topics (BoW): ")
for idx, topic in job_description_bow_topics:
    print('Topic: {} \nWords: {}'.format(idx, topic))

job_description_tfidf_topics = job_description_tfidf_lda_model.print_topics(num_words=10)
print("\n\nJob Description Topics (TF-IDF): ")
for idx, topic in job_description_tfidf_topics:
    print('Topic: {} \nWords: {}'.format(idx, topic))

Job Description Topics (BoW): 
Topic: 0 
Words: 0.058*"center" + 0.056*"language" + 0.036*"specialist" + 0.036*"education" + 0.035*"armenia" + 0.031*"international" + 0.030*"student" + 0.025*"card" + 0.022*"english" + 0.019*"position"
Topic: 1 
Words: 0.067*"program" + 0.051*"training" + 0.031*"armenia" + 0.025*"yerevan" + 0.022*"position" + 0.021*"foundation" + 0.019*"coordinator" + 0.017*"country" + 0.015*"office" + 0.014*"staff"
Topic: 2 
Words: 0.079*"sales" + 0.045*"company" + 0.036*"product" + 0.033*"incumbent" + 0.031*"representative" + 0.028*"customer" + 0.028*"manager" + 0.025*"medical" + 0.023*"store" + 0.015*"services"
Topic: 3 
Words: 0.066*"system" + 0.041*"network" + 0.035*"maintenance" + 0.031*"engineer" + 0.030*"support" + 0.029*"database" + 0.028*"incumbent" + 0.028*"design" + 0.024*"administrator" + 0.021*"construction"
Topic: 4 
Words: 0.067*"bank" + 0.047*"credit" + 0.035*"specialist" + 0.030*"department" + 0.029*"cjsc" + 0.027*"loan" + 0.022*"incumbent" + 0.022*"he

In [0]:
job_requirement_bow_topics = job_requirement_bow_lda_model.print_topics(num_words=10)
print("Job Requirement Topics (BoW): ")
for idx, topic in job_requirement_bow_topics:
    print('Topic: {} \nWords: {}'.format(idx, topic))

job_requirement_tfidf_topics = job_requirement_tfidf_lda_model.print_topics(num_words=10)
print("\n\nJob Requirement Topics (TF-IDF): ")
for idx, topic in job_requirement_tfidf_topics:
    print('Topic: {} \nWords: {}'.format(idx, topic))

Job Requirement Topics (BoW): 
Topic: 0 
Words: 0.030*"program" + 0.024*"project" + 0.023*"development" + 0.021*"management" + 0.017*"staff" + 0.016*"implementation" + 0.016*"policy" + 0.015*"programme" + 0.014*"activity" + 0.014*"donor"
Topic: 1 
Words: 0.048*"test" + 0.048*"design" + 0.043*"software" + 0.035*"application" + 0.034*"development" + 0.025*"team" + 0.023*"testing" + 0.023*"code" + 0.018*"requirement" + 0.017*"documentation"
Topic: 2 
Words: 0.020*"store" + 0.018*"control" + 0.018*"safety" + 0.017*"quality" + 0.015*"guest" + 0.014*"standard" + 0.013*"equipment" + 0.013*"food" + 0.012*"work" + 0.012*"service"
Topic: 3 
Words: 0.057*"office" + 0.031*"meeting" + 0.030*"document" + 0.020*"correspondence" + 0.016*"support" + 0.016*"assist" + 0.015*"duty" + 0.015*"translation" + 0.013*"report" + 0.013*"english"
Topic: 4 
Words: 0.032*"customer" + 0.029*"information" + 0.025*"call" + 0.020*"vehicle" + 0.019*"answer" + 0.017*"order" + 0.017*"duty" + 0.015*"staff" + 0.015*"inquiry"

In [0]:
required_qualification_bow_topics = required_qualification_bow_lda_model.print_topics(num_words=10)
print("Required Qualification Topics (BoW): ")
for idx, topic in required_qualification_bow_topics:
    print('Topic: {} \nWords: {}'.format(idx, topic))

required_qualification_tfidf_topics = required_qualification_tfidf_lda_model.print_topics(num_words=10)
print("\n\nRequired Qualification Topics (TF-IDF): ")
for idx, topic in required_qualification_tfidf_topics:
    print('Topic: {} \nWords: {}'.format(idx, topic))

Required Qualification Topics (BoW): 
Topic: 0 
Words: 0.096*"knowledge" + 0.074*"experience" + 0.033*"language" + 0.029*"skill" + 0.026*"years" + 0.026*"development" + 0.024*"database" + 0.024*"plus" + 0.020*"java" + 0.019*"work"
Topic: 1 
Words: 0.052*"finance" + 0.051*"skill" + 0.043*"banking" + 0.036*"economics" + 0.031*"ability" + 0.027*"experience" + 0.024*"analysis" + 0.022*"business" + 0.020*"bank" + 0.019*"team"
Topic: 2 
Words: 0.163*"knowledge" + 0.079*"experience" + 0.064*"language" + 0.053*"work" + 0.046*"education" + 0.044*"years" + 0.035*"field" + 0.029*"least" + 0.027*"legislation" + 0.018*"banking"
Topic: 3 
Words: 0.074*"security" + 0.031*"information" + 0.024*"operation" + 0.023*"equipment" + 0.023*"safety" + 0.021*"vehicle" + 0.021*"level" + 0.019*"care" + 0.019*"candidate" + 0.017*"applicant"
Topic: 4 
Words: 0.068*"knowledge" + 0.055*"experience" + 0.049*"skill" + 0.034*"development" + 0.031*"software" + 0.029*"ability" + 0.027*"language" + 0.025*"degree" + 0.025*

In [0]:
new_doc = "Investment Consulting Company is seeking a Chief Financial Officer. This position manages the company's fiscal and administrative functions, provides highly responsible and technically complex staff assistance to the Executive Director. The work performed requires a high level of technical proficiency in financial management and investment management, as well as management, supervisory and administrative skills."
document = parser(new_doc)

print("Noun Phrases...")
print("--- [Format: Noun Phrase -> Root Text] ---")
for noun_phrase in document.noun_chunks:
  #print(noun_phrase.text, noun_phrase.label_, noun_phrase.root.text)
  print(noun_phrase.text + " -> " + noun_phrase.root.text)

new_doc = prepare_text_for_lda(new_doc)
new_doc_bow = job_description_dictionary.doc2bow(new_doc)
new_doc_tfidf = job_description_tfidf_model[new_doc_bow]

# print("\nTopics relevant to new document are: ")
# print(job_description_bow_lda_model.get_document_topics(new_doc_bow, minimum_probability=0.1))

counter = 0
print("\n\nTopics relevant to new document are (BoW): ")
for index, score in sorted(job_description_bow_lda_model[new_doc_bow], key=lambda tup: -1*tup[1]):
  if counter == 0:
    print("Score: {}\t Topic: {}".format(score, job_description_bow_lda_model.print_topic(index, 10)))
    highest_score = score
    counter = counter + 1
  elif highest_score - score <= 0.3:
    print("Score: {}\t Topic: {}".format(score, job_description_bow_lda_model.print_topic(index, 10)))
    counter = counter + 1
  else:
    break

counter = 0
print("\n\nTopics relevant to new document are (TF-IDF): ")
for index, score in sorted(job_description_tfidf_lda_model[new_doc_tfidf], key=lambda tup: -1*tup[1]):
  if counter == 0:
    print("Score: {}\t Topic: {}".format(score, job_description_tfidf_lda_model.print_topic(index, 10)))
    highest_score = score
    counter = counter + 1
  elif highest_score - score <= 0.3:
    print("Score: {}\t Topic: {}".format(score, job_description_tfidf_lda_model.print_topic(index, 10)))
    counter = counter + 1
  else:
    break

Noun Phrases...
--- [Format: Noun Phrase -> Root Text] ---
Investment Consulting Company -> Company
a Chief Financial Officer -> Officer
This position -> position
the company's fiscal and administrative functions -> functions
highly responsible and technically complex staff assistance -> assistance
the Executive Director -> Director
The work -> work
a high level -> level
technical proficiency -> proficiency
financial management and investment management -> management
management -> management
skills -> skills


Topics relevant to new document are (BoW): 
Score: 0.43358665704727173	 Topic: 0.052*"office" + 0.049*"director" + 0.044*"management" + 0.043*"manager" + 0.034*"assistant" + 0.027*"supervision" + 0.026*"operations" + 0.022*"executive" + 0.022*"activity" + 0.022*"incumbent"
Score: 0.1840825229883194	 Topic: 0.029*"management" + 0.026*"incumbent" + 0.026*"analysis" + 0.023*"data" + 0.022*"procedure" + 0.022*"business" + 0.020*"risk" + 0.019*"report" + 0.018*"policy" + 0.018*"financ

In [0]:
job_description_dictionary = gensim.corpora.Dictionary.load('job_description_dictionary.gensim')
job_description_bow_corpus = pickle.load(open('job_description_bow_corpus.pkl', 'rb'))
job_description_bow_lda_model = gensim.models.ldamodel.LdaModel.load('job_description_bow_lda_model.gensim')
job_description_tfidf_corpus = pickle.load(open('job_description_tfidf_corpus.pkl', 'rb'))
job_description_tfidf_lda_model = gensim.models.ldamodel.LdaModel.load('job_description_tfidf_lda_model.gensim')

job_requirement_dictionary = gensim.corpora.Dictionary.load('job_requirement_dictionary.gensim')
job_requirement_bow_corpus = pickle.load(open('job_requirement_bow_corpus.pkl', 'rb'))
job_requirement_bow_lda_model = gensim.models.ldamodel.LdaModel.load('job_requirement_bow_lda_model.gensim')
job_requirement_tfidf_corpus = pickle.load(open('job_requirement_tfidf_corpus.pkl', 'rb'))
job_requirement_tfidf_lda_model = gensim.models.ldamodel.LdaModel.load('job_requirement_tfidf_lda_model.gensim')

required_qualification_dictionary = gensim.corpora.Dictionary.load('required_qualification_dictionary.gensim')
required_qualification_bow_corpus = pickle.load(open('required_qualification_bow_corpus.pkl', 'rb'))
required_qualification_bow_lda_model = gensim.models.ldamodel.LdaModel.load('required_qualification_bow_lda_model.gensim')
required_qualification_tfidf_corpus = pickle.load(open('required_qualification_tfidf_corpus.pkl', 'rb'))
required_qualification_tfidf_lda_model = gensim.models.ldamodel.LdaModel.load('required_qualification_tfidf_lda_model.gensim')

In [0]:
job_description_bow_lda_display = pyLDAvis.gensim.prepare(job_description_bow_lda_model, job_description_bow_corpus, job_description_dictionary, sort_topics=True, mds='mmds')
pyLDAvis.save_html(job_description_bow_lda_display, 'job_description_bow_lda.html')
pyLDAvis.display(job_description_bow_lda_display)

In [0]:
job_description_tfidf_lda_display = pyLDAvis.gensim.prepare(job_description_tfidf_lda_model, job_description_tfidf_corpus, job_description_dictionary, sort_topics=True, mds='mmds')
pyLDAvis.save_html(job_description_tfidf_lda_display, 'job_description_tfidf_lda.html')
pyLDAvis.display(job_description_tfidf_lda_display)

In [0]:
job_requirement_bow_lda_display = pyLDAvis.gensim.prepare(job_requirement_bow_lda_model, job_requirement_bow_corpus, job_requirement_dictionary, sort_topics=True, mds='mmds')
pyLDAvis.save_html(job_requirement_bow_lda_display, 'job_requirement_bow_lda.html')
pyLDAvis.display(job_requirement_bow_lda_display)

In [0]:
job_requirement_tfidf_lda_display = pyLDAvis.gensim.prepare(job_requirement_tfidf_lda_model, job_requirement_tfidf_corpus, job_requirement_dictionary, sort_topics=True, mds='mmds')
pyLDAvis.save_html(job_requirement_tfidf_lda_display, 'job_requirement_tfidf_lda.html')
pyLDAvis.display(job_requirement_tfidf_lda_display) 

In [0]:
required_qualification_bow_lda_display = pyLDAvis.gensim.prepare(required_qualification_bow_lda_model, required_qualification_bow_corpus, required_qualification_dictionary, sort_topics=True, mds='mmds')
pyLDAvis.save_html(required_qualification_bow_lda_display, 'required_qualification_bow_lda.html')
pyLDAvis.display(required_qualification_bow_lda_display)

In [0]:
required_qualification_tfidf_lda_display = pyLDAvis.gensim.prepare(required_qualification_tfidf_lda_model, required_qualification_tfidf_corpus, required_qualification_dictionary, sort_topics=True, mds='mmds')
pyLDAvis.save_html(required_qualification_tfidf_lda_display, 'required_qualification_tfidf_lda.html')
pyLDAvis.display(required_qualification_tfidf_lda_display)

In [0]:
!ls -ltr

total 147028
-rw-r--r-- 1 root root 95435519 Apr  3 03:12 online-job-posts.csv
drwxr-xr-x 1 root root     4096 Apr  4 20:20 sample_data
-rw-r--r-- 1 root root 13254376 Apr 23 11:50 online-job-posts.zip
-rw-r--r-- 1 root root  2316149 Apr 23 14:28 job_description_bow_corpus.pkl
-rw-r--r-- 1 root root  2607134 Apr 23 14:28 job_description_tfidf_corpus.pkl
-rw-r--r-- 1 root root   248274 Apr 23 14:28 job_description_dictionary.gensim
-rw-r--r-- 1 root root  5715308 Apr 23 14:28 job_requirement_bow_corpus.pkl
-rw-r--r-- 1 root root  6098902 Apr 23 14:28 job_requirement_tfidf_corpus.pkl
-rw-r--r-- 1 root root   329148 Apr 23 14:28 job_requirement_dictionary.gensim
-rw-r--r-- 1 root root  5714777 Apr 23 14:28 required_qualification_bow_corpus.pkl
-rw-r--r-- 1 root root  6017765 Apr 23 14:28 required_qualification_tfidf_corpus.pkl
-rw-r--r-- 1 root root   260200 Apr 23 14:28 required_qualification_dictionary.gensim
-rw-r--r-- 1 root root   865404 Apr 23 14:36 job_description_bow_lda_model.gen

In [0]:
files.download("job_description_dictionary.gensim")
files.download("job_description_bow_corpus.pkl")
files.download("job_description_tfidf_corpus.pkl")
files.download("job_description_bow_lda_model.gensim")
files.download("job_description_bow_lda_model.gensim.state")
files.download("job_description_bow_lda_model.gensim.id2word")
files.download("job_description_bow_lda_model.gensim.expElogbeta.npy")
files.download("job_description_tfidf_lda_model.gensim")
files.download("job_description_tfidf_lda_model.gensim.state")
files.download("job_description_tfidf_lda_model.gensim.id2word")
files.download("job_description_tfidf_lda_model.gensim.expElogbeta.npy")

files.download("job_requirement_dictionary.gensim")
files.download("job_requirement_bow_corpus.pkl")
files.download("job_requirement_tfidf_corpus.pkl")
files.download("job_requirement_bow_lda_model.gensim")
files.download("job_requirement_bow_lda_model.gensim.state")
files.download("job_requirement_bow_lda_model.gensim.id2word")
files.download("job_requirement_bow_lda_model.gensim.expElogbeta.npy")
files.download("job_requirement_tfidf_lda_model.gensim")
files.download("job_requirement_tfidf_lda_model.gensim.state")
files.download("job_requirement_tfidf_lda_model.gensim.id2word")
files.download("job_requirement_tfidf_lda_model.gensim.expElogbeta.npy")

files.download("required_qualification_dictionary.gensim")
files.download("required_qualification_bow_corpus.pkl")
files.download("required_qualification_tfidf_corpus.pkl")
files.download("required_qualification_bow_lda_model.gensim")
files.download("required_qualification_bow_lda_model.gensim.state")
files.download("required_qualification_bow_lda_model.gensim.id2word")
files.download("required_qualification_bow_lda_model.gensim.expElogbeta.npy")
files.download("required_qualification_tfidf_lda_model.gensim")
files.download("required_qualification_tfidf_lda_model.gensim.state")
files.download("required_qualification_tfidf_lda_model.gensim.id2word")
files.download("required_qualification_tfidf_lda_model.gensim.expElogbeta.npy")

files.download("job_description_bow_lda.html")
files.download("job_description_tfidf_lda.html")
files.download("job_requirement_bow_lda.html")
files.download("job_requirement_tfidf_lda.html")
files.download("required_qualification_bow_lda.html")
files.download("required_qualification_tfidf_lda.html")