In [11]:
# Download the spaCy model (one-time setup; uncomment the next line to run)
#!python -m spacy download en_core_web_lg

In [1]:
import os
import pickle
from pathlib import Path

import numpy as np
import pandas as pd
import spacy
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from spacy.lang.en import English

In [2]:
def spacy_tokenizer(doc):
    """
    Tokenize a document with spaCy and return the lemma of every token.

    The blank ``English`` pipeline is created on the first call and cached on
    the function object, so it is not rebuilt for every document. Earlier
    versions also called ``spacy.load('en_core_web_lg')`` on every call and
    threw the result away; that call is gone because its return value was
    never used to produce the tokens.

    NOTE(review): lemmas come from a *blank* English pipeline, exactly as
    before. Whether ``token.lemma_`` is populated for a blank pipeline depends
    on the installed spaCy version -- confirm against the version that was
    used to fit the pickled vectorizers.

    :param doc: text to tokenize
    :return: list with one lemma string per token, in document order
    """
    nlp = getattr(spacy_tokenizer, '_nlp', None)
    if nlp is None:
        # Build the pipeline once and reuse it for all later calls.
        nlp = spacy.lang.en.English()
        spacy_tokenizer._nlp = nlp
    return [token.lemma_ for token in nlp(doc)]

def text_process(mess):
    """
    Clean a piece of text and split it into words.

    Steps, in order:
    1. Lower-case the whole string.
    2. Delete every character listed in ``punctuations`` (note that the list
       does not contain '#' or '+', so those characters are kept).
    3. Split on whitespace.
    4. Drop every word found in spaCy's English stop-word set.

    :param mess: text to clean
    :return: list of the remaining words, in their original order
    """
    punctuations = '!"$%&\'()*,-./:;<=>?@[\\]^_`{|}~'

    # Lower-case first, then strip the unwanted characters in a single pass.
    lowered = mess.lower()
    cleaned = lowered.translate(str.maketrans('', '', punctuations))

    # Keep only the words that are not stop words.
    kept = []
    for word in cleaned.split():
        if word in spacy.lang.en.stop_words.STOP_WORDS:
            continue
        kept.append(word)
    return kept


# Load models from Pickle files

In [3]:
# Directory holding the pickled models. Defaults to the original author's
# location; set the DSJOBS_PICKLE_DIR environment variable to load the files
# from somewhere else without editing the notebook.
PICKLE_DIR = Path(os.environ.get('DSJOBS_PICKLE_DIR', '/Users/jamoth/DSR/DataScienceJobs/Pickles'))

# The file stores three objects, unpacked in the order they were saved.
# NOTE(review): presumably the vectorizer, the fitted vectorizer and the
# document-term matrix -- confirm against the code that wrote the pickle.
# SECURITY: pickle.load can execute arbitrary code; only load trusted files.
with open(PICKLE_DIR / 'BOG_model.pkl', 'rb') as file:
    BOG, BOG_fit, BOG_transform = pickle.load(file)

In [4]:
# Column-wise totals of the bag-of-words matrix: one count per vocabulary entry.
sum_words = BOG_transform.sum(axis=0)

# Pair every vocabulary word with its total and order the pairs by that
# total, largest first.
words_freq = sorted(
    ((word, sum_words[0, idx]) for word, idx in BOG_fit.vocabulary_.items()),
    key=lambda pair: pair[1],
    reverse=True,
)

print('20 most frequent words in the job descriptions:')
words_freq[:20]

20 most frequent words in the job descriptions:


[('data', 64665),
 ('experience', 33250),
 ('business', 21146),
 ('work', 20959),
 ('team', 19969),
 ('skills', 15538),
 ('working', 12020),
 ('learning', 11290),
 ('analytics', 11020),
 ('company', 10116),
 ('role', 9738),
 ('management', 9628),
 ('development', 9617),
 ('ability', 9282),
 ('new', 9248),
 ('analysis', 9115),
 ('research', 9102),
 ('knowledge', 8779),
 ('science', 8689),
 ('support', 8616)]

In [5]:
# Directory holding the pickled models. Defaults to the original author's
# location; set the DSJOBS_PICKLE_DIR environment variable to load the files
# from somewhere else without editing the notebook.
PICKLE_DIR = Path(os.environ.get('DSJOBS_PICKLE_DIR', '/Users/jamoth/DSR/DataScienceJobs/Pickles'))

# The file stores three objects, unpacked in the order they were saved.
# NOTE(review): presumably the vectorizer, the fitted vectorizer and the
# TF-IDF matrix -- confirm against the code that wrote the pickle.
# SECURITY: pickle.load can execute arbitrary code; only load trusted files.
with open(PICKLE_DIR / 'TFIDF_model.pkl', 'rb') as file:
    TFIDF, TFIDF_fit, TFIDF_transform = pickle.load(file)

In [6]:
# Take the row at index 7 of the TF-IDF matrix (the variable names say
# "first", but the row inspected here is index 7).
first_vector_spacy = TFIDF_transform[7]

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
if hasattr(TFIDF, 'get_feature_names_out'):
    tfidf_feature_names = TFIDF.get_feature_names_out()
else:
    tfidf_feature_names = TFIDF.get_feature_names()

# One row per vocabulary term with its weight in the selected document;
# toarray() yields a plain ndarray instead of the legacy np.matrix.
df_first_vector_tfidf_spacy = pd.DataFrame(
    first_vector_spacy.T.toarray(),
    index=tfidf_feature_names,
    columns=["tfidf"],
)

# Show the 50 highest-weighted terms for that document.
df_first_vector_tfidf_spacy.sort_values(by=["tfidf"], ascending=False).head(50)

Unnamed: 0,tfidf
hr,0.540552
jet2com,0.172459
jet2holidays,0.172459
leisure,0.147533
database,0.145222
queries,0.139897
relating,0.114131
openhr,0.103745
holiday,0.101715
essentialin,0.099579
