In [1]:
import numpy as np

import nltk
import pandas as pd

from sklearn.decomposition import NMF
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer

import pickle

## Import n-grammed pickle, vectorize

In [2]:
# Load the n-grammed postings pickle and show its dimensions.
# NOTE: unpickling executes arbitrary code; only load pickles you produced yourself.
df = pd.read_pickle(
    '../Data/01_clean_sf_custom_ngram'
)
df.shape

(3760, 4)

In [3]:
# Fit TF-IDF on the listed items; max_df=0.95 drops terms that occur in more
# than 95% of the documents.
tf_idf = TfidfVectorizer(max_df=0.95)
# .toarray() turns the sparse result into a dense ndarray (documents x terms).
tf_idf_array = tf_idf.fit_transform(df.listed_items).toarray()
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
if hasattr(tf_idf, 'get_feature_names_out'):
    tf_idf_terms = tf_idf.get_feature_names_out()
else:
    tf_idf_terms = tf_idf.get_feature_names()
tf_idf_df = pd.DataFrame(tf_idf_array, columns=tf_idf_terms)
tf_idf_df.shape

(3760, 17982)

## Pickling models/vectorizers for use in the Flask App

In [4]:
# Persist the fitted vectorizer. The context manager closes the file even if
# pickle.dump raises, which the manual open()/close() pair did not guarantee.
with open('../Tools_and_models/tf_idf_model', "wb") as pickle_out:
    pickle.dump(tf_idf, pickle_out)

In [5]:
# Persist the dense TF-IDF matrix. The context manager closes the file even if
# pickle.dump raises, which the manual open()/close() pair did not guarantee.
with open('../Tools_and_models/tf_idf_array', "wb") as pickle_out:
    pickle.dump(tf_idf_array, pickle_out)

In [6]:
# Assign explicit names to the four posting columns.
posting_columns = ['company_name', 'job_title', 'listed_items', 'posting_url']
df.columns = posting_columns

In [7]:
# Attach the TF-IDF term columns to the posting columns, matching rows by index,
# then persist the combined frame and preview the first two rows.
# NOTE: this rebinds `df`, so re-running the cell merges the term columns again.
df = pd.merge(df, tf_idf_df, left_index=True, right_index=True)
df.to_pickle('../Data/01_tf_idf_and_features')
df.head(2)

Unnamed: 0,company_name,job_title,listed_items,posting_url,aa,aaa,aac,aadbstrong_organizational,aai,aalas,...,zoneroot_cause,zonestrong_organizational,zoning,zookeeper,zoom,zoura,zsfg,zuora,zweigwhite,zymergen
0,Affimedix Inc,Scientist - Molecular Biology,molecular cloning dna library library screenin...,https://www.indeed.com/company/Affimedix-Inc/j...,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"AllAccem, Inc.","Associate Chemist, Production Chemistry",assist carrying sop chemical reaction gram mul...,"https://www.indeed.com/company/AllAccem,-Inc./...",0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Topic Modeling based on elbows found in KMeans clustering Inertia

Note that only the clustering of TF-IDF was used, as word2vec did not provide salient elbows. During my modeling I used the pretrained version of word2vec, which was trained on Google News. The lack of elbows when plotting inertia likely results from the fact that 'cloud', for example, was not used in relation to cloud computing in the training set. As a result, word2vec would not know to associate terms such as 'cloud' and 'azure'.

I ultimately decided to use 9 classes during topic modeling. While there was no elbow at 9 when plotting the inertia, it should not be surprising that the locations of the elbows did not correlate directly with the number of topics. After all, part of the difficulty in navigating data science job descriptions is that the different roles within the field of data science may require different skill sets (or, in terms of how that translates to my modeling, different jobs require different _'topics'_).

## 9 Classes

In [8]:
# Factorize the TF-IDF frame into 9 components; random_state is fixed at 42.
nmf_model = NMF(
    n_components=9,
    random_state=42,
)
nmf = nmf_model.fit_transform(tf_idf_df)

In [9]:
# W is the matrix returned by fit_transform; H is the fitted components_ matrix.
W, H = nmf, nmf_model.components_

The W factor contains the document membership weights relative to each of the k topics. Each row corresponds to a single document, and each column corresponds to a topic.

In [10]:
# Dimensions of W: one row per document, one column per topic.
W.shape

(3760, 9)

The H factor contains the term weights relative to each of the k topics. In this case, each row corresponds to a topic, and each column corresponds to a unique term in the corpus vocabulary.

In [11]:
# Dimensions of H: one row per topic, one column per vocabulary term.
H.shape

(9, 17982)

In [12]:
def get_descriptor(terms, H, topic_index, top):
    """Return the highest-weighted terms of a single topic.

    Parameters
    ----------
    terms : sequence
        Vocabulary, indexable by the column positions of ``H``.
    H : 2-D array
        Term-weight matrix; row ``topic_index`` is the topic to describe.
    topic_index : int
        Row of ``H`` to inspect.
    top : int
        Number of terms to return.

    Returns
    -------
    list
        Up to ``top`` entries of ``terms``, ordered from largest to smallest weight.
    """
    # argsort is ascending, so flip it to put the heaviest term first.
    ranked = np.argsort(H[topic_index, :])[::-1]
    return [terms[idx] for idx in ranked[:top]]

In [13]:
def list_top_wrods(model, feature_names, n_top_words):
    """Collect the top-weighted words of every topic in a fitted model.

    Parameters
    ----------
    model : object
        Any object exposing ``components_`` as an iterable of 1-D weight arrays,
        one per topic.
    feature_names : sequence
        Vocabulary, indexable by the positions of each weight array.
    n_top_words : int
        Number of words to keep per topic.

    Returns
    -------
    list of list
        One list per topic, each ordered from largest to smallest weight.
    """
    top_words = []
    for topic in model.components_:
        # Ascending argsort, reversed and truncated to the n heaviest positions.
        ranked = topic.argsort()[::-1][:n_top_words]
        top_words.append([feature_names[i] for i in ranked])
    return top_words


# Correctly spelled alias; the original (misspelled) name is kept so existing
# callers continue to work.
list_top_words = list_top_wrods

In [14]:
def print_top_words(model, feature_names, n_top_words):
    """Print a header and the top-weighted words for each topic of a model.

    For every row of ``model.components_`` this prints ``Topic #<index>:`` and,
    on the next line, the ``n_top_words`` heaviest entries of ``feature_names``
    separated by spaces. A single blank line is printed after the last topic.
    """
    for topic_idx, weights in enumerate(model.components_):
        # Positions of the heaviest weights, largest first.
        ranked = weights.argsort()[::-1][:n_top_words]
        print("Topic #%d:" % topic_idx)
        print(" ".join(feature_names[pos] for pos in ranked))

    print()

In [15]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
if hasattr(tf_idf, 'get_feature_names_out'):
    vocabulary_terms = tf_idf.get_feature_names_out()
else:
    vocabulary_terms = tf_idf.get_feature_names()
# Show the 10 heaviest terms of each NMF topic.
print_top_words(nmf_model, vocabulary_terms, 10)

Topic #0:
business analysis analytics insight sql quantitative analytical decision reporting statistical
Topic #1:
learning machine algorithm model deep ml ai python tensorflow technique
Topic #2:
project management process support client technical assist required quality manage
Topic #3:
web test application testing javascript end development framework react cs
Topic #4:
design product research designer ux visual interaction engineer prototype mobile
Topic #5:
customer product sale marketing strategy technical partner team drive management
Topic #6:
biology assay cell clinical laboratory molecular scientific method analysis study
Topic #7:
security network vulnerability cloud threat application secure infrastructure incident technical
Topic #8:
pipeline aws spark cloud distributed platform infrastructure database scale java

