In [6]:
import numpy as np

import nltk
import pandas as pd

from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import TfidfVectorizer

import pickle

In [2]:
# Load the pickled DataFrame that the rest of this notebook works on.
# NOTE(review): read_pickle unpickles the file, which can execute arbitrary code;
# only point this at a file from a trusted source.
df = pd.read_pickle('../Data/01_clean_sf_custom_ngram')

In [3]:
# Number of rows in the loaded frame.
len(df)

3416

## 10 Classes

In [8]:
# Build the TF-IDF document-term matrix from the `listed_items` text column.
# max_df=0.95 ignores terms that appear in more than 95% of the documents.
tf_idf = TfidfVectorizer(max_df=0.95)
# fit_transform learns the vocabulary and produces the weights in a single pass;
# previously the same object was fitted with fit() and then fitted again here.
tf_idf_array = tf_idf.fit_transform(df.listed_items).toarray()
# fit() returns the estimator itself, so this name has always been an alias of tf_idf.
# It is kept because a later cell pickles it under its own file name.
tf_idf_vectorizer = tf_idf
# get_feature_names() was removed in scikit-learn 1.2; use its replacement when it
# exists and fall back to the old method on older installations.
if hasattr(tf_idf, "get_feature_names_out"):
    feature_names = tf_idf.get_feature_names_out()
else:
    feature_names = tf_idf.get_feature_names()
# Reuse df's index so the index-based merge further down pairs every TF-IDF row
# with the posting it was computed from, even if df's index is not 0..n-1.
tf_idf_df = pd.DataFrame(tf_idf_array, columns=feature_names, index=df.index)
tf_idf_df.shape

(3416, 15951)

``` python 
# Persist the fitted TF-IDF vectorizer.
# The context manager closes the file even if pickle.dump raises.
with open('../Tools_and_models/tf_idf_model', "wb") as pickle_out:
    pickle.dump(tf_idf, pickle_out)
```

``` python 
# Persist the object bound to tf_idf_vectorizer.
# NOTE(review): tf_idf_vectorizer comes from tf_idf.fit(...), which returns the
# vectorizer itself, so this file presumably duplicates tf_idf_model — confirm it is needed.
# The context manager closes the file even if pickle.dump raises.
with open('../Tools_and_models/tf_idf_vectorizer', "wb") as pickle_out:
    pickle.dump(tf_idf_vectorizer, pickle_out)
```

In [5]:
# Rename the columns of df in place to these four names.
# pandas raises a length-mismatch error if df does not have exactly four columns.
# NOTE(review): the execution counter shows this cell ran before the TF-IDF cell above it.
df.columns = ['company_name', 'job_title', 'listed_items', 'posting_url']

In [6]:
# Attach the TF-IDF term columns to the posting metadata, matching rows on the index.
# suffixes=('', '_tfidf') keeps the metadata column names untouched if a vocabulary
# term shares a name with one of them (the vocabulary contains underscore-joined
# n-grams, so a term such as "job_title" is possible). The default suffixes would
# rename both sides to *_x / *_y and break the later `df.listed_items` lookups.
# NOTE(review): this rebinds df, so running the cell twice merges the term columns
# in a second time.
df = df.merge(tf_idf_df, left_index=True, right_index=True, suffixes=('', '_tfidf'))
df.to_pickle('../Data/01_tf_idf_and_features')
df.head(2)

Unnamed: 0,company_name,job_title,listed_items,posting_url,00,000,000ad_hoc,000best_practice,000machine_learning,000skill_experience,...,zookeeperend_end,zoom,zoura,zuckerberg,zurb,zvs,zweigwhite,zymergen,zymo,zynga
0,Gap Inc. Corporate,"Software Engineer, Price Execution",write build product according business need co...,https://www.indeed.com/rc/clk?jk=77d524a7cf198...,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,WrkShp,Business Analyst,closely product assist investigation deep dive...,https://www.indeed.com/company/WrkShp/jobs/Bus...,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
# Factorise the TF-IDF matrix into 10 components with a fixed seed.
nmf_model = NMF(
    random_state=42,
    n_components=10,
)
# fit_transform returns the transformed data (one row per input row).
nmf = nmf_model.fit_transform(tf_idf_df)

In [6]:
# W: document-by-topic weights; H: topic-by-term weights taken from the fitted model.
W, H = nmf, nmf_model.components_

The W factor contains the document membership weights relative to each of the k topics. Each row corresponds to a single document, and each column corresponds to a topic.

In [7]:
# Shape check: (number of documents, number of topics).
W.shape

(3416, 10)

The H factor contains the term weights relative to each of the k topics. In this case, each row corresponds to a topic, and each column corresponds to a unique term in the corpus vocabulary.

In [8]:
# Shape check: (number of topics, number of vocabulary terms).
H.shape

(10, 15951)

In [9]:
# Term indices of topic 1 ordered from highest to lowest weight
# (argsort is ascending, hence the reversal).
# NOTE(review): no cell visible here reads this variable.
top_indices = np.argsort(H[1])[::-1]

In [10]:
def get_descriptor(terms, H, topic_index, top):
    """Return the `top` highest-weighted terms of one topic, strongest first.

    terms       -- sequence of term names, indexed by column position of H
    H           -- 2-D array indexed as H[topic_index, :]
    topic_index -- row of H to describe
    top         -- number of leading terms to return
    """
    # argsort is ascending, so read it backwards to get the largest weights first
    ranked = np.argsort(H[topic_index, :])[::-1]
    return [terms[position] for position in ranked[:top]]

In [11]:
def print_top_words(model, feature_names, n_top_words):
    """Print, for every row of model.components_, a header line and its top terms.

    model         -- object exposing `components_` (iterated row by row)
    feature_names -- term names, indexed by column position
    n_top_words   -- how many of the highest-weighted terms to print per topic

    A single blank line is printed after the last topic.
    """
    for topic_idx, weights in enumerate(model.components_):
        # argsort is ascending: reverse it, then keep the leading n_top_words
        strongest = weights.argsort()[::-1][:n_top_words]
        words = [feature_names[i] for i in strongest]
        print("Topic #%d:" % topic_idx)
        print(" ".join(words))
    print()

In [12]:
# Show the 10 strongest terms of each topic.
# The term names are read from the columns of tf_idf_df (the matrix the NMF model was
# fitted on) because TfidfVectorizer.get_feature_names() was removed in scikit-learn 1.2.
print_top_words(nmf_model, list(tf_idf_df.columns), 10)

Topic #0:
product customer technical team business market roadmap experience cross need
Topic #1:
experience system software technology java database test year service application
Topic #2:
cell biology assay experience laboratory molecular scientific method chemistry protein
Topic #3:
learning machine machine_learning model algorithm ml deep science deep_learning tensorflow
Topic #4:
benefit dental paid vision medical lunch employee insurance medical_dental commuter
Topic #5:
business analysis analytics insight sql model statistical experience tool quantitative
Topic #6:
project process management required ability support system requirement client business
Topic #7:
marketing sale content campaign market channel customer strategy digital skill
Topic #8:
design user research experience product designer visual web mobile interaction
Topic #9:
security network system infrastructure vulnerability incident threat cloud application linux



## 15 Classes

In [13]:
# Rebuild the TF-IDF document-term matrix for the 15-component run.
# max_df=0.95 ignores terms that appear in more than 95% of the documents.
tf_idf = TfidfVectorizer(max_df=0.95)
tf_idf_array = tf_idf.fit_transform(df.listed_items).toarray()
# get_feature_names() was removed in scikit-learn 1.2; use its replacement when it
# exists and fall back to the old method on older installations.
if hasattr(tf_idf, "get_feature_names_out"):
    feature_names = tf_idf.get_feature_names_out()
else:
    feature_names = tf_idf.get_feature_names()
tf_idf_df = pd.DataFrame(tf_idf_array, columns=feature_names)
tf_idf_df.shape

(3416, 15951)

In [14]:
# Factorise the TF-IDF matrix into 15 components with a fixed seed.
nmf_model = NMF(
    random_state=42,
    n_components=15,
)
# fit_transform returns the transformed data (one row per input row).
nmf = nmf_model.fit_transform(tf_idf_df)

In [15]:
# W: document-by-topic weights; H: topic-by-term weights taken from the fitted model.
W, H = nmf, nmf_model.components_

The W factor contains the document membership weights relative to each of the k topics. Each row corresponds to a single document, and each column corresponds to a topic.

In [16]:
# Shape check: (number of documents, number of topics).
W.shape

(3416, 15)

The H factor contains the term weights relative to each of the k topics. In this case, each row corresponds to a topic, and each column corresponds to a unique term in the corpus vocabulary.

In [17]:
# Shape check: (number of topics, number of vocabulary terms).
H.shape

(15, 15951)

In [18]:
# Term indices of topic 1 ordered from highest to lowest weight
# (argsort is ascending, hence the reversal).
# NOTE(review): no cell visible here reads this variable.
top_indices = np.argsort(H[1])[::-1]

In [19]:
def get_descriptor(terms, H, topic_index, top):
    """Return the `top` highest-weighted terms of one topic, strongest first.

    terms       -- sequence of term names, indexed by column position of H
    H           -- 2-D array indexed as H[topic_index, :]
    topic_index -- row of H to describe
    top         -- number of leading terms to return

    NOTE(review): this repeats a definition that already appears earlier in the notebook.
    """
    # argsort is ascending, so read it backwards to get the largest weights first
    descending = np.argsort(H[topic_index, :])[::-1]
    return [terms[column] for column in descending[:top]]

In [20]:
def print_top_words(model, feature_names, n_top_words):
    """Print, for every row of model.components_, a header line and its top terms.

    model         -- object exposing `components_` (iterated row by row)
    feature_names -- term names, indexed by column position
    n_top_words   -- how many of the highest-weighted terms to print per topic

    A single blank line is printed after the last topic.
    NOTE(review): this repeats a definition that already appears earlier in the notebook.
    """
    for topic_idx, weights in enumerate(model.components_):
        # argsort is ascending: reverse it, then keep the leading n_top_words
        strongest = weights.argsort()[::-1][:n_top_words]
        words = [feature_names[i] for i in strongest]
        print("Topic #%d:" % topic_idx)
        print(" ".join(words))
    print()

In [21]:
# Show the 10 strongest terms of each topic.
# The term names are read from the columns of tf_idf_df (the matrix the NMF model was
# fitted on) because TfidfVectorizer.get_feature_names() was removed in scikit-learn 1.2.
print_top_words(nmf_model, list(tf_idf_df.columns), 10)

Topic #0:
product experience team roadmap engineering feature market cross product_management strategy
Topic #1:
learning machine machine_learning algorithm model ml deep deep_learning science tensorflow
Topic #2:
process project manufacturing system equipment quality control design material mechanical
Topic #3:
analysis analytics business insight statistical sql tool quantitative statistic model
Topic #4:
paid dental vision lunch insurance benefit medical company health flexible
Topic #5:
business project client management process solution ability functional requirement technology
Topic #6:
customer technical support sale service issue solution need provide software
Topic #7:
cell biology assay molecular scientific experience protein laboratory development method
Topic #8:
design user research visual designer experience interaction ux prototyping prototype
Topic #9:
security network system infrastructure vulnerability incident threat cloud linux application
Topic #10:
marketing sale c

## 17 Classes

In [22]:
# Rebuild the TF-IDF document-term matrix for the 17-component run.
# max_df=0.95 ignores terms that appear in more than 95% of the documents.
tf_idf = TfidfVectorizer(max_df=0.95)
tf_idf_array = tf_idf.fit_transform(df.listed_items).toarray()
# get_feature_names() was removed in scikit-learn 1.2; use its replacement when it
# exists and fall back to the old method on older installations.
if hasattr(tf_idf, "get_feature_names_out"):
    feature_names = tf_idf.get_feature_names_out()
else:
    feature_names = tf_idf.get_feature_names()
tf_idf_df = pd.DataFrame(tf_idf_array, columns=feature_names)
tf_idf_df.shape

(3416, 15951)

In [23]:
# Factorise the TF-IDF matrix into 17 components with a fixed seed.
nmf_model = NMF(
    random_state=42,
    n_components=17,
)
# fit_transform returns the transformed data (one row per input row).
nmf = nmf_model.fit_transform(tf_idf_df)

In [24]:
# W: document-by-topic weights; H: topic-by-term weights taken from the fitted model.
W, H = nmf, nmf_model.components_

The W factor contains the document membership weights relative to each of the k topics. Each row corresponds to a single document, and each column corresponds to a topic.

In [25]:
# Shape check: (number of documents, number of topics).
W.shape

(3416, 17)

The H factor contains the term weights relative to each of the k topics. In this case, each row corresponds to a topic, and each column corresponds to a unique term in the corpus vocabulary.

In [26]:
# Shape check: (number of topics, number of vocabulary terms).
H.shape

(17, 15951)

In [27]:
# Term indices of topic 1 ordered from highest to lowest weight
# (argsort is ascending, hence the reversal).
# NOTE(review): no cell visible here reads this variable.
top_indices = np.argsort(H[1])[::-1]

In [28]:
def get_descriptor(terms, H, topic_index, top):
    """Return the `top` highest-weighted terms of one topic, strongest first.

    terms       -- sequence of term names, indexed by column position of H
    H           -- 2-D array indexed as H[topic_index, :]
    topic_index -- row of H to describe
    top         -- number of leading terms to return

    NOTE(review): this repeats a definition that already appears earlier in the notebook.
    """
    # argsort is ascending, so read it backwards to get the largest weights first
    by_weight = np.argsort(H[topic_index, :])[::-1]
    return [terms[col] for col in by_weight[:top]]

In [29]:
def print_top_words(model, feature_names, n_top_words):
    """Print, for every row of model.components_, a header line and its top terms.

    model         -- object exposing `components_` (iterated row by row)
    feature_names -- term names, indexed by column position
    n_top_words   -- how many of the highest-weighted terms to print per topic

    A single blank line is printed after the last topic.
    NOTE(review): this repeats a definition that already appears earlier in the notebook.
    """
    for topic_idx, weights in enumerate(model.components_):
        # argsort is ascending: reverse it, then keep the leading n_top_words
        strongest = weights.argsort()[::-1][:n_top_words]
        words = [feature_names[i] for i in strongest]
        print("Topic #%d:" % topic_idx)
        print(" ".join(words))
    print()

In [30]:
# Show the 10 strongest terms of each topic.
# The term names are read from the columns of tf_idf_df (the matrix the NMF model was
# fitted on) because TfidfVectorizer.get_feature_names() was removed in scikit-learn 1.2.
print_top_words(nmf_model, list(tf_idf_df.columns), 10)

Topic #0:
product team market roadmap feature cross product_management strategy define drive
Topic #1:
learning machine machine_learning model algorithm ml deep deep_learning tensorflow production
Topic #2:
process project manufacturing equipment system quality design material mechanical control
Topic #3:
analysis analytics insight business statistical sql tool quantitative statistic model
Topic #4:
paid dental lunch vision benefit insurance medical company health 401k
Topic #5:
business project management solution process requirement functional technical partner client
Topic #6:
skill ability must strong research communication client project excel detail
Topic #7:
cell biology assay molecular scientific experience laboratory protein biochemistry development
Topic #8:
design user research visual designer experience interaction ux prototyping prototype
Topic #9:
security network system infrastructure vulnerability incident threat cloud linux application
Topic #10:
marketing sale content