In [1]:
import numpy as np
import pandas as pd

from sklearn.decomposition import NMF

In [2]:
# Load the previously saved DataFrame from a pickle file.
# NOTE: unpickling can execute arbitrary code, so only load pickles from a trusted source.
df = pd.read_pickle('Data/clean_seattle_ngram_df')

In [3]:
# Number of rows in the loaded DataFrame.
len(df)

2908

## Testing TF-IDF NMF with new ngrams

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [5]:
# Build the TF-IDF document-term matrix from the `listed_items` column.
# max_df=0.95 drops terms whose document frequency exceeds 95% of the documents.
tf_idf = TfidfVectorizer(max_df=0.95)
tf_idf_array = tf_idf.fit_transform(df.listed_items).toarray()
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older installs.
if hasattr(tf_idf, "get_feature_names_out"):
    tf_idf_terms = tf_idf.get_feature_names_out()
else:
    tf_idf_terms = tf_idf.get_feature_names()
# One row per document, one column per vocabulary term.
tf_idf_df = pd.DataFrame(tf_idf_array, columns=tf_idf_terms)
tf_idf_df.shape

(2908, 10016)

In [6]:
# Factorize the TF-IDF matrix into 10 components; random_state is fixed so re-runs are repeatable.
nmf_model = NMF(n_components=10, random_state=42)
# fit_transform fits the model and returns the transformed data; the fitted factors stay on nmf_model.
nmf = nmf_model.fit_transform(tf_idf_df)

In [7]:
# W aliases the fit_transform output; H aliases the fitted model's components_ array.
W = nmf
H = nmf_model.components_

The W factor contains the document membership weights relative to each of the k topics. Each row corresponds to a single document, and each column corresponds to a topic.

In [8]:
# Sanity check: expect (number of documents, number of topics).
W.shape

(2908, 10)

The H factor contains the term weights relative to each of the k topics. In this case, each row corresponds to a topic, and each column corresponds to a unique term in the corpus vocabulary.

In [9]:
# Sanity check: expect (number of topics, vocabulary size).
H.shape

(10, 10016)

In [10]:
# Column indices of row 1 of H (the second topic), ordered from highest to lowest weight.
# NOTE(review): no later cell shown here reads this module-level variable;
# get_descriptor computes its own local top_indices.
top_indices = np.argsort(H[1,:])[::-1]

In [11]:
def get_descriptor(terms, H, topic_index, top):
    """Return the `top` highest-weighted terms of a single topic.

    Parameters
    ----------
    terms : sequence
        Maps a column index of ``H`` to its term.
    H : 2-D array
        Term weights, indexed as ``H[topic_index, :]``.
    topic_index : int
        Row of ``H`` to describe.
    top : int
        How many leading terms to return.

    Returns
    -------
    list
        Terms ordered from largest to smallest weight in the chosen row.
    """
    # argsort is ascending, so flip it to rank the heaviest terms first
    ranked = np.argsort(H[topic_index, :])[::-1]
    return [terms[idx] for idx in ranked[:top]]

In [12]:
def print_top_words(model, feature_names, n_top_words):
    """Print the heaviest terms of every topic in ``model.components_``.

    Parameters
    ----------
    model : object
        Anything exposing ``components_``, iterated row by row (one row per topic).
    feature_names : sequence of str
        Maps a column index of ``components_`` to its term.
    n_top_words : int
        Number of terms printed per topic.

    For each topic a ``Topic #<index>:`` header line is printed, then one line
    with the terms joined by spaces. A single blank line follows the last topic.
    """
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        # slice the ascending argsort from its end to walk the largest weights first
        heaviest = topic.argsort()[:-n_top_words - 1:-1]
        print(" ".join(feature_names[i] for i in heaviest))
    print()

In [13]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older installs.
if hasattr(tf_idf, "get_feature_names_out"):
    feature_names = tf_idf.get_feature_names_out()
else:
    feature_names = tf_idf.get_feature_names()
# Show the 10 heaviest terms of each NMF topic.
print_top_words(nmf_model, feature_names, 10)

Topic #0:
experi system softwar servic cloud technolog develop build engin scale
Topic #1:
comput_scienc_fundament comput scienc fundament degre_comput_scienc_relat object_orient_design object orient bachelor_degre_comput_scienc algorithm
Topic #2:
learn machin machin_learn model experi statist comput languag phd algorithm
Topic #3:
market custom product sale content strategi execut busi partner develop
Topic #4:
requir support project document process system assist provid manag maintain
Topic #5:
busi analyt statist analysi sql model report experi quantit insight
Topic #6:
test autom develop experi qa qualiti softwar framework methodolog script
Topic #7:
paid compani custom benefit employe user time dental dog help
Topic #8:
manag project abil busi technic skill team communic experi function
Topic #9:
design product experi engin manufactur develop research prototyp user softwar

