<a href="https://colab.research.google.com/github/indranildchandra/JD_Keywords_Extractor/blob/master/src/Topic_Modeling_on_Job_Descriptions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Upload Dataset

In [1]:
from google.colab import files

uploaded = files.upload()

for fn in uploaded.keys():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(uploaded[fn])))

#upload online-job-posts.zip file for training

Saving online-job-posts.zip to online-job-posts.zip
User uploaded file "online-job-posts.zip" with length 13254376 bytes


In [2]:
!unzip online-job-posts.zip

Archive:  online-job-posts.zip
  inflating: online-job-posts.csv    


In [3]:
!ls -ltr

total 106148
-rw-r--r-- 1 root root 95435519 Apr  3 03:12 online-job-posts.csv
drwxr-xr-x 1 root root     4096 Apr  4 20:20 sample_data
-rw-r--r-- 1 root root 13254376 May  1 05:47 online-job-posts.zip


# Install required dependencies

In [4]:
!pip install pyLDAvis
!pip install paramiko

Collecting pyLDAvis
[?25l  Downloading https://files.pythonhosted.org/packages/a5/3a/af82e070a8a96e13217c8f362f9a73e82d61ac8fff3a2561946a97f96266/pyLDAvis-2.1.2.tar.gz (1.6MB)
[K    100% |████████████████████████████████| 1.6MB 19.4MB/s 
Collecting funcy (from pyLDAvis)
  Downloading https://files.pythonhosted.org/packages/b3/23/d1f90f4e2af5f9d4921ab3797e33cf0503e3f130dd390a812f3bf59ce9ea/funcy-1.12-py2.py3-none-any.whl
Building wheels for collected packages: pyLDAvis
  Building wheel for pyLDAvis (setup.py) ... [?25ldone
[?25h  Stored in directory: /root/.cache/pip/wheels/98/71/24/513a99e58bb6b8465bae4d2d5e9dba8f0bef8179e3051ac414
Successfully built pyLDAvis
Installing collected packages: funcy, pyLDAvis
Successfully installed funcy-1.12 pyLDAvis-2.1.2
Collecting paramiko
[?25l  Downloading https://files.pythonhosted.org/packages/cf/ae/94e70d49044ccc234bfdba20114fa947d7ba6eb68a2e452d89b920e62227/paramiko-2.4.2-py2.py3-none-any.whl (193kB)
[K    100% |████████████████████████████

# Import dependencies

In [0]:
import spacy
import nltk
import random
import pickle
import gensim
import numpy
import pyLDAvis.gensim
import pandas as pd
import matplotlib.pyplot as plt

from spacy.lang.en import English
from spacy import displacy
from nltk.corpus import wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer
from gensim import corpora, models
from gensim.models import Phrases
from gensim.models import Word2Vec
from sklearn.manifold import TSNE

%matplotlib inline

# Ignore warnings

In [0]:
import warnings
warnings.filterwarnings('ignore', category=DeprecationWarning)
warnings.filterwarnings('ignore', category=PendingDeprecationWarning)
warnings.filterwarnings('ignore', category=ResourceWarning)
warnings.filterwarnings('ignore', category=FutureWarning)

# Download NLTK datasets

In [7]:
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

# Define Tokenizer

In [0]:
parser = spacy.load('en')

In [0]:
def tokenize(text):
    lda_tokens = []
    tokens = parser(text)
    for token in tokens:
        if token.orth_.isspace():
            continue
        elif token.is_punct:
            continue
        elif token.like_url:
            lda_tokens.append('URL')
        elif token.orth_.startswith('@'):
            lda_tokens.append('at')
        elif token.pos_ == "ADJ" or token.pos_ == "VERB" or token.pos_ == "RBR" or token.pos_ == "RBS" or token.pos_ == "RB" or token.pos_ == "RP":
            continue
        else:
            lda_tokens.append(token.lower_)
    return lda_tokens

In [0]:
def get_lemma(word):
    lemma = wn.morphy(word)
    if lemma is None:
        return word
    else:
        return lemma
    
def get_lemma2(word):
    return WordNetLemmatizer().lemmatize(word)

In [11]:
for w in ['ran', 'happier', 'charging']:
    print(w, get_lemma(w), get_lemma2(w))

ran run ran
happier happy happier
charging charge charging


In [0]:
en_stop = set(nltk.corpus.stopwords.words('english'))

# Read Data

In [13]:
#[["jobpost","date","Title","Company","AnnouncementCode","Term","Eligibility","Audience","StartDate","Duration","Location","JobDescription","JobRequirement","RequiredQual","Salary","ApplicationP","OpeningDate","Deadline","Notes","AboutC","Attach","Year","Month","IT"]]
raw_data = pd.read_csv("online-job-posts.csv")
raw_data.fillna("", inplace = True)

print(raw_data.head(2))

                                             jobpost         date  \
0  AMERIA Investment Consulting Company\r\nJOB TI...  Jan 5, 2004   
1  International Research & Exchanges Board (IREX...  Jan 7, 2004   

                                               Title  \
0                            Chief Financial Officer   
1  Full-time Community Connections Intern (paid i...   

                                           Company AnnouncementCode Term  \
0             AMERIA Investment Consulting Company                         
1  International Research & Exchanges Board (IREX)                         

  Eligibility Audience StartDate  Duration  ... Salary  \
0                                           ...          
1                                 3 months  ...          

                                        ApplicationP OpeningDate   Deadline  \
0  To apply for this position, please submit a\r\...              26-Jan-04   
1  Please submit a cover letter and resume to:\r\...         

# Prepare Data

In [0]:
def prepare_text(text):
    tokens = tokenize(text)
    tokens = [token for token in tokens if len(token) > 3]
    tokens = [token for token in tokens if token not in en_stop]
    tokens = [get_lemma(token) for token in tokens]
    return tokens

# Tokenize Data

In [15]:
%%time

job_description_data = []
job_requirement_data = []
required_qualification_data = []

for index, entry in raw_data.iterrows():
  job_description_tokens = prepare_text(str(entry["JobDescription"]).strip())
  job_requirement_tokens = prepare_text(str(entry["JobRequirement"]).strip())
  required_qualification_tokens = prepare_text(str(entry["RequiredQual"]).strip())
  
  if not len(job_description_tokens) == 0:
    job_description_data.append(job_description_tokens)
  if not len(job_requirement_tokens) == 0:
    job_requirement_data.append(job_requirement_tokens)
  if not len(required_qualification_tokens) == 0:
    required_qualification_data.append(required_qualification_tokens)

CPU times: user 39min 38s, sys: 16min 13s, total: 55min 51s
Wall time: 28min 21s


# LDA - Latent Dirichlet Allocation

**Define Dictionary**

In [16]:
%%time

job_description_dictionary = corpora.Dictionary(job_description_data)
job_requirement_dictionary = corpora.Dictionary(job_requirement_data)
required_qualification_dictionary = corpora.Dictionary(required_qualification_data)

CPU times: user 1.87 s, sys: 72.9 ms, total: 1.94 s
Wall time: 1.87 s


**Filter out tokens that appear in -**

*   less than 15 documents (absolute number)
*   more than 0.5 documents (fraction of total corpus size, not absolute number)
*   after the above two steps, keep only the first 100000 most frequent tokens




In [0]:
# job_description_dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)
# job_requirement_dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)
# required_qualification_dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

**Print 10 example words from each of the three dictionaries formed**

In [18]:
count = 0
print("{} words present in job_description_dictionary.".format(len(job_description_dictionary)))
print("Examples from job_description_dictionary...")
for k, v in job_description_dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break
        
count = 0
print("\n{} words present in job_requirement_dictionary.".format(len(job_requirement_dictionary)))
print("Examples from job_requirement_dictionary...")
for k, v in job_requirement_dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break
        
count = 0
print("\n{} words present in required_qualification_dictionary.".format(len(required_qualification_dictionary)))
print("Examples from required_qualification_dictionary...")
for k, v in required_qualification_dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break

7454 words present in job_description_dictionary.
Examples from job_description_dictionary...
0 ameria
1 assistance
2 chief
3 company
4 consult
5 director
6 executive
7 financial
8 function
9 highly
10 investment

9821 words present in job_requirement_dictionary.
Examples from job_requirement_dictionary...
0 accounting
1 action
2 activity
3 adequacy
4 administration
5 article
6 assigning
7 assist
8 board
9 budget
10 cash

7758 words present in required_qualification_dictionary.
Examples from required_qualification_dictionary...
0 ability
1 acca
2 accounting
3 accounting/
4 activity
5 administration
6 analysis
7 analytical
8 and/or
9 application
10 audience


**Define Bag-of-Word model Corpus**

In [19]:
%%time

job_description_bow_corpus = [job_description_dictionary.doc2bow(text) for text in job_description_data]
job_requirement_bow_corpus = [job_requirement_dictionary.doc2bow(text) for text in job_requirement_data]
required_qualification_bow_corpus = [required_qualification_dictionary.doc2bow(text) for text in required_qualification_data]

CPU times: user 1.34 s, sys: 67.1 ms, total: 1.41 s
Wall time: 1.41 s


**Preview Bag of Words on a sample pre-processed document**

In [20]:
print("Examples from job_description_bow_corpus...")
for i in range(len(job_description_bow_corpus[10])):
  print("Word {} (\"{}\") appears {} time.".format(job_description_bow_corpus[10][i][0], job_description_dictionary[job_description_bow_corpus[10][i][0]], job_description_bow_corpus[10][i][1]))

print("\nExamples from job_requirement_bow_corpus...")
for i in range(len(job_requirement_bow_corpus[10])):
  print("Word {} (\"{}\") appears {} time.".format(job_requirement_bow_corpus[10][i][0], job_requirement_dictionary[job_requirement_bow_corpus[10][i][0]], job_requirement_bow_corpus[10][i][1]))

print("\nExamples from required_qualification_bow_corpus...")
for i in range(len(required_qualification_bow_corpus[10])):
  print("Word {} (\"{}\") appears {} time.".format(required_qualification_bow_corpus[10][i][0], required_qualification_dictionary[required_qualification_bow_corpus[10][i][0]], required_qualification_bow_corpus[10][i][1]))

Examples from job_description_bow_corpus...
Word 14 ("position") appears 1 time.
Word 149 ("communication") appears 1 time.
Word 150 ("concept") appears 1 time.
Word 151 ("design") appears 1 time.
Word 152 ("designer") appears 1 time.
Word 153 ("experience") appears 1 time.
Word 154 ("field") appears 1 time.
Word 155 ("graphic") appears 1 time.
Word 156 ("medium") appears 1 time.
Word 157 ("since") appears 1 time.
Word 158 ("study") appears 1 time.

Examples from job_requirement_bow_corpus...
Word 11 ("client") appears 1 time.
Word 25 ("development") appears 1 time.
Word 28 ("documentation") appears 1 time.
Word 64 ("product") appears 1 time.
Word 82 ("team") appears 1 time.
Word 111 ("communication") appears 1 time.
Word 123 ("group") appears 1 time.
Word 137 ("project") appears 1 time.
Word 164 ("software") appears 1 time.
Word 266 ("also") appears 1 time.
Word 267 ("designer") appears 1 time.
Word 268 ("every") appears 1 time.
Word 269 ("graphic") appears 1 time.
Word 270 ("guide") 

**Define TF-IDF model Corpus**

In [21]:
%%time

job_description_tfidf_model = models.TfidfModel(job_description_bow_corpus)
job_description_tfidf_corpus = job_description_tfidf_model[job_description_bow_corpus]

job_requirement_tfidf_model = models.TfidfModel(job_requirement_bow_corpus)
job_requirement_tfidf_corpus = job_requirement_tfidf_model[job_requirement_bow_corpus]

required_qualification_tfidf_model = models.TfidfModel(required_qualification_bow_corpus)
required_qualification_tfidf_corpus = required_qualification_tfidf_model[required_qualification_bow_corpus]

CPU times: user 391 ms, sys: 5.2 ms, total: 396 ms
Wall time: 398 ms


**Save Dictionaries and Corpora**

In [0]:
pickle.dump(job_description_bow_corpus, open('job_description_bow_corpus.pkl', 'wb'))
pickle.dump(job_description_tfidf_corpus, open('job_description_tfidf_corpus.pkl', 'wb'))
job_description_dictionary.save('job_description_dictionary.gensim')

pickle.dump(job_requirement_bow_corpus, open('job_requirement_bow_corpus.pkl', 'wb'))
pickle.dump(job_requirement_tfidf_corpus, open('job_requirement_tfidf_corpus.pkl', 'wb'))
job_requirement_dictionary.save('job_requirement_dictionary.gensim')

pickle.dump(required_qualification_bow_corpus, open('required_qualification_bow_corpus.pkl', 'wb'))
pickle.dump(required_qualification_tfidf_corpus, open('required_qualification_tfidf_corpus.pkl', 'wb'))
required_qualification_dictionary.save('required_qualification_dictionary.gensim')

**Retrieve Number of CPU Cores in the VM**

In [23]:
!grep -c ^processor /proc/cpuinfo
# Set this value to NUM_OF_CORES

2


**Set Number of CPU Cores**

In [0]:
NUM_OF_CORES = 2

**Build LDA models**

In [25]:
%%time

JOB_DESCRIPTION_NUM_TOPICS = 20
job_description_bow_lda_model = gensim.models.LdaMulticore(job_description_bow_corpus, num_topics = JOB_DESCRIPTION_NUM_TOPICS, id2word=job_description_dictionary, passes=100, workers=NUM_OF_CORES)
job_description_bow_lda_model.save('job_description_bow_lda_model.gensim')
job_description_tfidf_lda_model = gensim.models.LdaMulticore(job_description_tfidf_corpus, num_topics = JOB_DESCRIPTION_NUM_TOPICS, id2word=job_description_dictionary, passes=100, workers=NUM_OF_CORES)
job_description_tfidf_lda_model.save('job_description_tfidf_lda_model.gensim')

CPU times: user 9min 23s, sys: 19 s, total: 9min 42s
Wall time: 12min 41s


In [26]:
%%time

JOB_REQUIREMENT_NUM_TOPICS = 20
job_requirement_bow_lda_model = gensim.models.LdaMulticore(job_requirement_bow_corpus, num_topics = JOB_REQUIREMENT_NUM_TOPICS, id2word=job_requirement_dictionary, passes=100, workers=NUM_OF_CORES)
job_requirement_bow_lda_model.save('job_requirement_bow_lda_model.gensim')
job_requirement_tfidf_lda_model = gensim.models.LdaMulticore(job_requirement_tfidf_corpus, num_topics = JOB_REQUIREMENT_NUM_TOPICS, id2word=job_requirement_dictionary, passes=100, workers=NUM_OF_CORES)
job_requirement_tfidf_lda_model.save('job_requirement_tfidf_lda_model.gensim')

CPU times: user 15min 38s, sys: 27.7 s, total: 16min 6s
Wall time: 20min


In [27]:
%%time

REQUIRED_QUALIFICATION_NUM_TOPICS = 20
required_qualification_bow_lda_model = gensim.models.LdaMulticore(required_qualification_bow_corpus, num_topics = REQUIRED_QUALIFICATION_NUM_TOPICS, id2word=required_qualification_dictionary, passes=100, workers=NUM_OF_CORES)
required_qualification_bow_lda_model.save('required_qualification_bow_lda_model.gensim')
required_qualification_tfidf_lda_model = gensim.models.LdaMulticore(required_qualification_tfidf_corpus, num_topics = REQUIRED_QUALIFICATION_NUM_TOPICS, id2word=required_qualification_dictionary, passes=100, workers=NUM_OF_CORES)
required_qualification_tfidf_lda_model.save('required_qualification_tfidf_lda_model.gensim')

CPU times: user 17min 39s, sys: 33.9 s, total: 18min 13s
Wall time: 22min 58s


**Display top 10 words in each topic of each LDA model**

In [28]:
job_description_bow_topics = job_description_bow_lda_model.print_topics(num_words=10)
print("Job Description Topics (BoW): ")
for idx, topic in job_description_bow_topics:
    print('Topic: {} \nWords: {}'.format(idx, topic))

job_description_tfidf_topics = job_description_tfidf_lda_model.print_topics(num_words=10)
print("\n\nJob Description Topics (TF-IDF): ")
for idx, topic in job_description_tfidf_topics:
    print('Topic: {} \nWords: {}'.format(idx, topic))

Job Description Topics (BoW): 
Topic: 0 
Words: 0.032*"policy" + 0.026*"management" + 0.023*"procedure" + 0.020*"compliance" + 0.019*"activity" + 0.018*"risk" + 0.017*"regulation" + 0.016*"security" + 0.013*"within" + 0.012*"water"
Topic: 1 
Words: 0.033*"self" + 0.032*"skill" + 0.028*"representative" + 0.028*"position" + 0.028*"company" + 0.026*"medical" + 0.022*"candidate" + 0.020*"team" + 0.019*"person" + 0.018*"insurance"
Topic: 2 
Words: 0.082*"program" + 0.060*"development" + 0.055*"armenia" + 0.044*"project" + 0.021*"usaid" + 0.020*"manager" + 0.019*"region" + 0.017*"sector" + 0.015*"yerevan" + 0.015*"position"
Topic: 3 
Words: 0.180*"project" + 0.048*"implementation" + 0.035*"management" + 0.028*"manager" + 0.026*"activity" + 0.024*"team" + 0.023*"monitoring" + 0.019*"plan" + 0.018*"incumbent" + 0.017*"evaluation"
Topic: 4 
Words: 0.081*"system" + 0.038*"network" + 0.033*"engineer" + 0.024*"maintenance" + 0.024*"support" + 0.023*"administrator" + 0.022*"database" + 0.020*"posit

In [29]:
job_requirement_bow_topics = job_requirement_bow_lda_model.print_topics(num_words=10)
print("Job Requirement Topics (BoW): ")
for idx, topic in job_requirement_bow_topics:
    print('Topic: {} \nWords: {}'.format(idx, topic))

job_requirement_tfidf_topics = job_requirement_tfidf_lda_model.print_topics(num_words=10)
print("\n\nJob Requirement Topics (TF-IDF): ")
for idx, topic in job_requirement_tfidf_topics:
    print('Topic: {} \nWords: {}'.format(idx, topic))

Job Requirement Topics (BoW): 
Topic: 0 
Words: 0.032*"vehicle" + 0.030*"guest" + 0.024*"office" + 0.021*"hotel" + 0.017*"maintenance" + 0.015*"safety" + 0.015*"security" + 0.015*"equipment" + 0.014*"service" + 0.013*"repair"
Topic: 1 
Words: 0.082*"call" + 0.056*"answer" + 0.049*"mail" + 0.044*"telephone" + 0.044*"staff" + 0.036*"duty" + 0.035*"visitor" + 0.035*"phone" + 0.031*"inquiry" + 0.026*"perform"
Topic: 2 
Words: 0.122*"test" + 0.050*"testing" + 0.040*"case" + 0.030*"plan" + 0.023*"tool" + 0.017*"software" + 0.016*"develop" + 0.016*"result" + 0.016*"script" + 0.015*"regression"
Topic: 3 
Words: 0.042*"medium" + 0.036*"material" + 0.026*"event" + 0.023*"website" + 0.022*"activity" + 0.020*"content" + 0.017*"information" + 0.015*"organize" + 0.015*"company" + 0.013*"organization"
Topic: 4 
Words: 0.033*"credit" + 0.032*"bank" + 0.031*"loan" + 0.028*"branch" + 0.025*"client" + 0.015*"document" + 0.015*"company" + 0.013*"contract" + 0.012*"organization" + 0.012*"banks"
Topic: 5 
W

In [30]:
required_qualification_bow_topics = required_qualification_bow_lda_model.print_topics(num_words=10)
print("Required Qualification Topics (BoW): ")
for idx, topic in required_qualification_bow_topics:
    print('Topic: {} \nWords: {}'.format(idx, topic))

required_qualification_tfidf_topics = required_qualification_tfidf_lda_model.print_topics(num_words=10)
print("\n\nRequired Qualification Topics (TF-IDF): ")
for idx, topic in required_qualification_tfidf_topics:
    print('Topic: {} \nWords: {}'.format(idx, topic))

Required Qualification Topics (BoW): 
Topic: 0 
Words: 0.082*"knowledge" + 0.076*"system" + 0.057*"network" + 0.045*"server" + 0.039*"windows" + 0.038*"experience" + 0.034*"administration" + 0.023*"linux" + 0.022*"security" + 0.020*"technology"
Topic: 1 
Words: 0.065*"experience" + 0.065*"skill" + 0.031*"degree" + 0.030*"language" + 0.028*"project" + 0.028*"knowledge" + 0.028*"years" + 0.026*"management" + 0.019*"communication" + 0.018*"computer"
Topic: 2 
Words: 0.079*"skill" + 0.068*"ability" + 0.058*"knowledge" + 0.050*"language" + 0.032*"communication" + 0.029*"experience" + 0.023*"work" + 0.022*"computer" + 0.021*"pressure" + 0.020*"office"
Topic: 3 
Words: 0.108*"skill" + 0.042*"experience" + 0.037*"ability" + 0.030*"knowledge" + 0.029*"management" + 0.028*"business" + 0.027*"communication" + 0.027*"marketing" + 0.026*"language" + 0.019*"degree"
Topic: 4 
Words: 0.094*"experience" + 0.031*"degree" + 0.030*"years" + 0.027*"training" + 0.024*"language" + 0.017*"health" + 0.016*"and

**Test LDA model on unseen data**

In [31]:
%%time

new_doc = "Investment Consulting Company is seeking a Chief Financial Officer. This position manages the company's fiscal and administrative functions, provides highly responsible and technically complex staff assistance to the Executive Director. The work performed requires a high level of technical proficiency in financial management and investment management, as well as management, supervisory and administrative skills."
document = parser(new_doc)

print("Noun Phrases...")
print("--- [Format: Noun Phrase -> Root Text] ---")
for noun_phrase in document.noun_chunks:
  #print(noun_phrase.text, noun_phrase.label_, noun_phrase.root.text)
  print(noun_phrase.text + " -> " + noun_phrase.root.text)

new_doc = prepare_text(new_doc)
new_doc_bow = job_description_dictionary.doc2bow(new_doc)
new_doc_tfidf = job_description_tfidf_model[new_doc_bow]

# print("\nTopics relevant to new document are: ")
# print(job_description_bow_lda_model.get_document_topics(new_doc_bow, minimum_probability=0.1))

counter = 0
print("\n\nTopics relevant to new document are (BoW): ")
for index, score in sorted(job_description_bow_lda_model[new_doc_bow], key=lambda tup: -1*tup[1]):
  if counter == 0:
    print("Score: {}\t Topic: {}".format(score, job_description_bow_lda_model.print_topic(index, 10)))
    highest_score = score
    counter = counter + 1
  elif highest_score - score <= 0.3:
    print("Score: {}\t Topic: {}".format(score, job_description_bow_lda_model.print_topic(index, 10)))
    counter = counter + 1
  else:
    break

counter = 0
print("\n\nTopics relevant to new document are (TF-IDF): ")
for index, score in sorted(job_description_tfidf_lda_model[new_doc_tfidf], key=lambda tup: -1*tup[1]):
  if counter == 0:
    print("Score: {}\t Topic: {}".format(score, job_description_tfidf_lda_model.print_topic(index, 10)))
    highest_score = score
    counter = counter + 1
  elif highest_score - score <= 0.3:
    print("Score: {}\t Topic: {}".format(score, job_description_tfidf_lda_model.print_topic(index, 10)))
    counter = counter + 1
  else:
    break

Noun Phrases...
--- [Format: Noun Phrase -> Root Text] ---
Investment Consulting Company -> Company
a Chief Financial Officer -> Officer
This position -> position
the company's fiscal and administrative functions -> functions
highly responsible and technically complex staff assistance -> assistance
the Executive Director -> Director
The work -> work
a high level -> level
technical proficiency -> proficiency
financial management and investment management -> management
management -> management
skills -> skills


Topics relevant to new document are (BoW): 
Score: 0.5682087540626526	 Topic: 0.045*"director" + 0.038*"company" + 0.037*"accounting" + 0.032*"chief" + 0.030*"accountant" + 0.029*"management" + 0.024*"supervision" + 0.022*"incumbent" + 0.022*"finance" + 0.021*"executive"
Score: 0.2818472385406494	 Topic: 0.047*"opportunity" + 0.035*"team" + 0.030*"role" + 0.026*"incumbent" + 0.024*"development" + 0.019*"code" + 0.018*"intern" + 0.014*"management" + 0.014*"product" + 0.014*"intern

**Load persisted dictionaries, corpora and LDA models**

In [0]:
job_description_dictionary = gensim.corpora.Dictionary.load('job_description_dictionary.gensim')
job_description_bow_corpus = pickle.load(open('job_description_bow_corpus.pkl', 'rb'))
job_description_bow_lda_model = gensim.models.ldamodel.LdaModel.load('job_description_bow_lda_model.gensim')
job_description_tfidf_corpus = pickle.load(open('job_description_tfidf_corpus.pkl', 'rb'))
job_description_tfidf_lda_model = gensim.models.ldamodel.LdaModel.load('job_description_tfidf_lda_model.gensim')

job_requirement_dictionary = gensim.corpora.Dictionary.load('job_requirement_dictionary.gensim')
job_requirement_bow_corpus = pickle.load(open('job_requirement_bow_corpus.pkl', 'rb'))
job_requirement_bow_lda_model = gensim.models.ldamodel.LdaModel.load('job_requirement_bow_lda_model.gensim')
job_requirement_tfidf_corpus = pickle.load(open('job_requirement_tfidf_corpus.pkl', 'rb'))
job_requirement_tfidf_lda_model = gensim.models.ldamodel.LdaModel.load('job_requirement_tfidf_lda_model.gensim')

required_qualification_dictionary = gensim.corpora.Dictionary.load('required_qualification_dictionary.gensim')
required_qualification_bow_corpus = pickle.load(open('required_qualification_bow_corpus.pkl', 'rb'))
required_qualification_bow_lda_model = gensim.models.ldamodel.LdaModel.load('required_qualification_bow_lda_model.gensim')
required_qualification_tfidf_corpus = pickle.load(open('required_qualification_tfidf_corpus.pkl', 'rb'))
required_qualification_tfidf_lda_model = gensim.models.ldamodel.LdaModel.load('required_qualification_tfidf_lda_model.gensim')

**Visualize all the built LDA models **

In [33]:
job_description_bow_lda_display = pyLDAvis.gensim.prepare(job_description_bow_lda_model, job_description_bow_corpus, job_description_dictionary, sort_topics=True, mds='mmds')
pyLDAvis.save_html(job_description_bow_lda_display, 'job_description_bow_lda.html')
pyLDAvis.display(job_description_bow_lda_display)

In [34]:
job_description_tfidf_lda_display = pyLDAvis.gensim.prepare(job_description_tfidf_lda_model, job_description_tfidf_corpus, job_description_dictionary, sort_topics=True, mds='mmds')
pyLDAvis.save_html(job_description_tfidf_lda_display, 'job_description_tfidf_lda.html')
pyLDAvis.display(job_description_tfidf_lda_display)

In [35]:
job_requirement_bow_lda_display = pyLDAvis.gensim.prepare(job_requirement_bow_lda_model, job_requirement_bow_corpus, job_requirement_dictionary, sort_topics=True, mds='mmds')
pyLDAvis.save_html(job_requirement_bow_lda_display, 'job_requirement_bow_lda.html')
pyLDAvis.display(job_requirement_bow_lda_display)

In [36]:
job_requirement_tfidf_lda_display = pyLDAvis.gensim.prepare(job_requirement_tfidf_lda_model, job_requirement_tfidf_corpus, job_requirement_dictionary, sort_topics=True, mds='mmds')
pyLDAvis.save_html(job_requirement_tfidf_lda_display, 'job_requirement_tfidf_lda.html')
pyLDAvis.display(job_requirement_tfidf_lda_display) 

In [37]:
required_qualification_bow_lda_display = pyLDAvis.gensim.prepare(required_qualification_bow_lda_model, required_qualification_bow_corpus, required_qualification_dictionary, sort_topics=True, mds='mmds')
pyLDAvis.save_html(required_qualification_bow_lda_display, 'required_qualification_bow_lda.html')
pyLDAvis.display(required_qualification_bow_lda_display)

In [38]:
required_qualification_tfidf_lda_display = pyLDAvis.gensim.prepare(required_qualification_tfidf_lda_model, required_qualification_tfidf_corpus, required_qualification_dictionary, sort_topics=True, mds='mmds')
pyLDAvis.save_html(required_qualification_tfidf_lda_display, 'required_qualification_tfidf_lda.html')
pyLDAvis.display(required_qualification_tfidf_lda_display)

# Word2Vec Model

**Visualize Word2Vec model on a scatter plot using TSNE**

In [0]:
def visualize_data(data_array):
  tsne = TSNE(n_components=2)
  X_tsne = tsne.fit_transform(data_array)

  plt.rcParams['figure.figsize'] = [10, 10]
  plt.scatter(X_tsne[:, 0], X_tsne[:, 1])
  plt.show()

**Visualize Word2Vec model on an interatcive scatter plot using TSNE and Bokeh.JS**

In [0]:
from bokeh.plotting import figure, show
from bokeh.io import push_notebook, output_notebook
from bokeh.models import ColumnDataSource, LabelSet

In [0]:
def interactive_tsne(text_labels, data_array):
    '''makes an interactive scatter plot with text labels for each point'''

    tsne = TSNE(n_components=2)
    tsne_array = tsne.fit_transform(data_array)
    
    # define a dataframe to be used by bokeh context
    bokeh_df = pd.DataFrame(tsne_array, text_labels, columns=['x','y'])
    bokeh_df['text_labels'] = bokeh_df.index

    # interactive controls to include to the plot
    TOOLS="hover, zoom_in, zoom_out, box_zoom, undo, redo, reset, box_select"

    p = figure(tools=TOOLS, plot_width=700, plot_height=700)

    # define data source for the plot
    source = ColumnDataSource(bokeh_df)

    # scatter plot
    p.scatter('x', 'y', source=source, fill_alpha=0.6,
              fill_color="#8724B5",
              line_color=None)

    # text labels
    labels = LabelSet(x='x', y='y', text='text_labels', y_offset=8,
                      text_font_size="8pt", text_color="#555555",
                      source=source, text_align='center')

    p.add_layout(labels)

    # show plot inline
    output_notebook()
    show(p)

**Training Vanilla Word2Vec Model**

In [0]:
%%time

# word2vec expects a list of list: each document is a list of tokens
model = Word2Vec(sentences=job_description_data, # tokenized senteces, list of list of strings
                 size=300,  # size of embedding vectors
                 workers=4, # how many threads?
                 min_count=20, # minimum frequency per token, filtering rare words
                 sample=0.05, # weight of downsampling common words
                 sg = 0, # should we use skip-gram? if 0, then cbow
                 iter=5, # number of iterations
                 hs = 0
        )

X = model[model.wv.vocab]

# summarize the loaded model
print("Summarizing model...")
print(model)

# summarize vocabulary
words = list(model.wv.vocab)
print("\nSummarizing Vocabulary...")
print(words)

# access vector for one word
print("\nWord2Vec for the word - Engineer: ")
print(model['engineer'])

# save model
model.save('model.bin')

# load model
model = Word2Vec.load('model.bin')
print("Successfully loaded model...")
print(model)

In [0]:
# visualize data
visualize_data(X)

In [0]:
#visualize data interactively
interactive_tsne(model.wv.vocab.keys(), X)

**Training Word2Vec Model with Part-of_Speech tags**

In [0]:
%%time 

sent_w_pos = [nltk.pos_tag(d) for d in job_description_data]
sents = [[tup[0]+tup[1] for tup in d] for d in sent_w_pos]

model_pos = Word2Vec(sentences=sents,
                 size=300,
                 workers=4,
                 min_count=20,
                 sample=0.05,
                 sg = 0,
                 hs=0,
                 iter=5
        )


X_pos = model_pos[model_pos.wv.vocab]

# summarize the loaded model
print("Summarizing model...")
print(model_pos)

# summarize vocabulary
words = list(model_pos.wv.vocab)
print("\nSummarizing Vocabulary...")
print(words)

# access vector for one word
print("\nWord2Vec for the word - Engineer: ")
print(model_pos['engineerNN'])

# save model
model_pos.save('model_pos.bin')

# load model
model_pos = Word2Vec.load('model_pos.bin')
print("Successfully loaded model...")
print(model_pos)

In [0]:
# visualize data
visualize_data(X_pos)

In [0]:
#visualize data interactively
interactive_tsne(model_pos.wv.vocab.keys(), X)

# List all files in current directory

In [39]:
!ls -ltr

total 147820
-rw-r--r-- 1 root root 95435519 Apr  3 03:12 online-job-posts.csv
drwxr-xr-x 1 root root     4096 Apr  4 20:20 sample_data
-rw-r--r-- 1 root root 13254376 May  1 05:47 online-job-posts.zip
-rw-r--r-- 1 root root  2316149 May  1 06:16 job_description_bow_corpus.pkl
-rw-r--r-- 1 root root  2607134 May  1 06:16 job_description_tfidf_corpus.pkl
-rw-r--r-- 1 root root   248274 May  1 06:16 job_description_dictionary.gensim
-rw-r--r-- 1 root root  5715308 May  1 06:16 job_requirement_bow_corpus.pkl
-rw-r--r-- 1 root root  6098902 May  1 06:16 job_requirement_tfidf_corpus.pkl
-rw-r--r-- 1 root root   329148 May  1 06:16 job_requirement_dictionary.gensim
-rw-r--r-- 1 root root  5714777 May  1 06:16 required_qualification_bow_corpus.pkl
-rw-r--r-- 1 root root  6017765 May  1 06:16 required_qualification_tfidf_corpus.pkl
-rw-r--r-- 1 root root   260200 May  1 06:16 required_qualification_dictionary.gensim
-rw-r--r-- 1 root root   864826 May  1 06:23 job_description_bow_lda_model.gen

# Download all dictionaries, corpora, models and output files

In [0]:
files.download("job_description_dictionary.gensim")
files.download("job_description_bow_corpus.pkl")
files.download("job_description_tfidf_corpus.pkl")
files.download("job_description_bow_lda_model.gensim")
files.download("job_description_bow_lda_model.gensim.state")
files.download("job_description_bow_lda_model.gensim.id2word")
files.download("job_description_bow_lda_model.gensim.expElogbeta.npy")
files.download("job_description_tfidf_lda_model.gensim")
files.download("job_description_tfidf_lda_model.gensim.state")
files.download("job_description_tfidf_lda_model.gensim.id2word")
files.download("job_description_tfidf_lda_model.gensim.expElogbeta.npy")

files.download("job_requirement_dictionary.gensim")
files.download("job_requirement_bow_corpus.pkl")
files.download("job_requirement_tfidf_corpus.pkl")
files.download("job_requirement_bow_lda_model.gensim")
files.download("job_requirement_bow_lda_model.gensim.state")
files.download("job_requirement_bow_lda_model.gensim.id2word")
files.download("job_requirement_bow_lda_model.gensim.expElogbeta.npy")
files.download("job_requirement_tfidf_lda_model.gensim")
files.download("job_requirement_tfidf_lda_model.gensim.state")
files.download("job_requirement_tfidf_lda_model.gensim.id2word")
files.download("job_requirement_tfidf_lda_model.gensim.expElogbeta.npy")

files.download("required_qualification_dictionary.gensim")
files.download("required_qualification_bow_corpus.pkl")
files.download("required_qualification_tfidf_corpus.pkl")
files.download("required_qualification_bow_lda_model.gensim")
files.download("required_qualification_bow_lda_model.gensim.state")
files.download("required_qualification_bow_lda_model.gensim.id2word")
files.download("required_qualification_bow_lda_model.gensim.expElogbeta.npy")
files.download("required_qualification_tfidf_lda_model.gensim")
files.download("required_qualification_tfidf_lda_model.gensim.state")
files.download("required_qualification_tfidf_lda_model.gensim.id2word")
files.download("required_qualification_tfidf_lda_model.gensim.expElogbeta.npy")

files.download("job_description_bow_lda.html")
files.download("job_description_tfidf_lda.html")
files.download("job_requirement_bow_lda.html")
files.download("job_requirement_tfidf_lda.html")
files.download("required_qualification_bow_lda.html")
files.download("required_qualification_tfidf_lda.html")