In [8]:
from gensim.scripts.glove2word2vec import glove2word2vec
import gensim.downloader as api
import pandas as pd

In [4]:
# Downloads pre-trained glove model
model = api.load('glove-wiki-gigaword-300')



In [5]:
# exmple query to get word similar to king and women
result = model.most_similar(positive=['woman', 'king'], negative=['man'], topn=1)

print('Most similar word to King + Woman: ', result)

Most similar word to King + Woman:  [('queen', 0.6713276505470276)]


In [7]:
# example query to get word most similiar to some cybersecurity ksa skills
result = model.most_similar(positive=['cyber','security', 'network', 'intrusion'], topn=5)

print('Most similar word: ', result)

Most similar word:  [('networks', 0.5996201038360596), ('internet', 0.580498456954956), ('surveillance', 0.5709319710731506), ('threat', 0.518102765083313), ('threats', 0.5028305649757385)]


In [9]:
# usa jobs file
file = './Data/Cleaned Data/USAJobs.csv'

In [10]:
# load file into pandas and create single desc col
df = pd.read_csv(file)

In [11]:
df['desc'] = df['Duties'] + df['Qualifications']

In [63]:
texts = df['desc'].tolist()

In [57]:
# Import libraries to build Word2Vec model, and load Newsgroups data
import os
import sys
import re
from gensim.models import Word2Vec
from gensim.models.phrases import Phraser, Phrases
from gensim.parsing.preprocessing import remove_stopwords

In [66]:
# remove stopwords from text
texts = [remove_stopwords(str(text).lower()) for text in texts]

In [67]:
# Cleaning data - remove punctuation from every text
sentences = []
# Go through each text in turn
for ii in range(len(texts)):
    sentences = [re.sub(pattern=r'[\!"#$%&\*+,-./:;<=>?@^_`()|~=]', 
                        repl='', 
                        string=x
                       ).strip().split(' ') for x in str(texts[ii]).split('\n')]
    sentences = [x for x in sentences if x != ['']]
    texts[ii] = sentences

In [68]:
# gets all sentences and adds to text to compare/embed
all_sentences = []
for text in texts:
    all_sentences += text

In [69]:
# Phrase Detection
# Give some common terms that can be ignored in phrase detection
# For example, 'state_of_affairs' will be detected because 'of' is provided here: 
common_terms = ["of", "with", "without", "and", "or", "the", "a"]
# Create the relevant phrases from the list of sentences:
phrases = Phrases(all_sentences, connector_words=common_terms)
# The Phraser object is used from now on to transform sentences
bigram = Phraser(phrases)

# Applying the Phraser to transform our sentences is simply
all_sentences = list(bigram[all_sentences])

In [70]:
#  creates model using Word2Vec to create new word embeddings based on the words in the corpus texts
model = Word2Vec(all_sentences, 
                 min_count=3,   # Ignore words that appear less than this
                 vector_size=300,      # Dimensionality of word embeddings
                 workers=4,     # Number of processors (parallelisation)
                 window=5,      # Context window for words during training
                 epochs=50)       # Number of epochs training over corpus

In [71]:
model.vector_size

300

In [72]:
# total vocab (i think the pre-trained has a few million so not sure if this is better or not)
len(model.wv)

26969

In [74]:
# sample query to see what words are most similar to some cyber ksas
result = model.wv.most_similar(positive=['network', 'security'])

print('Most similar word: ', result)

Most similar word:  [('systems', 0.7998062968254089), ('cybersecurity', 0.6941286325454712), ('cyber', 0.6395346522331238), ('information', 0.609721302986145), ('management', 0.6047764420509338), ('technical', 0.5650832056999207), ('software', 0.5505338311195374), ('program', 0.5496156811714172), ('operations', 0.5478859543800354), ('compliance', 0.5327330827713013)]
