In [1]:
import numpy as np
import pandas as pd

import gensim

from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
df = pd.read_pickle('../Data/00_clean_data')

In [3]:
len(df)

11643

### Combine documents and split into one list of words

In [4]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))


In [5]:
clean_sents = list(sent_to_words(df.listed_items))

### Create ngrams from corpus

In [6]:

# note that the value_count for the 210th bigram is 331
# note that the value_count for the 100th trigram is 169
# note that the value_count for the 100th trigram is 169

In [7]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(clean_sents, min_count=331) 
trigram = gensim.models.Phrases(bigram[clean_sents], min_count=169)  
quadgram = gensim.models.Phrases(bigram[clean_sents], min_count=124)  

bigram_model = gensim.models.phrases.Phraser(bigram)
trigram_model = gensim.models.phrases.Phraser(trigram)
quadgram_model = gensim.models.phrases.Phraser(quadgram)



In [8]:
clean_sents = [quadgram_model[trigram_model[bigram_model[t]]] for t in clean_sents]

In [30]:
df.drop('listed_items',axis=1,inplace=True)
df.reset_index(drop=True,inplace=True)

In [32]:
joined_clean_sents = []
for clean_sent in clean_sents:
    joined_clean_sents.append(' '.join(clean_sent))

In [35]:
clean_sents_df = pd.DataFrame(joined_clean_sents,columns=['listed_items'])

In [40]:
df = df.merge(clean_sents_df,left_index=True,right_index=True)

### Combine ngrams with document text

In [41]:
df.to_pickle('../Data/00_clean_gensim_ngram')

## Testing new ngrams

In [12]:
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import TfidfVectorizer

In [42]:
tf_idf = TfidfVectorizer(max_df=0.95)
tf_idf_array = tf_idf.fit_transform(df.listed_items).toarray()
tf_idf_df = pd.DataFrame(tf_idf_array,columns=tf_idf.get_feature_names())
tf_idf_df.shape

(11643, 24123)

In [43]:
nmf_model = NMF(n_components=10, random_state=42)
nmf = nmf_model.fit_transform(tf_idf_df)

In [44]:
W = nmf
H = nmf_model.components_

The W factor contains the document membership weights relative to each of the k topics. Each row corresponds to a single document, and each column correspond to a topic.

In [45]:
W.shape

(11643, 10)

The H factor contains the term weights relative to each of the k topics. In this case, each row corresponds to a topic, and each column corresponds to a unique term in the corpus vocabulary.

In [46]:
H.shape

(10, 24123)

In [47]:
top_indices = np.argsort(H[1,:])[::-1]

In [48]:
def get_descriptor(terms, H, topic_index, top):
    #reverse sort the values to sort the indices
    top_indices = np.argsort(H[topic_index,:])[::-1]
    top_terms = []
    for term_index in top_indices[0:top]:
        top_terms.append(terms[term_index])
    return top_terms

In [49]:
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))
    print()

In [50]:
print_top_words(nmf_model,tf_idf.get_feature_names(),10)

Topic #0:
business analysis financial project analytics reporting ability management process analytical
Topic #1:
security network system service support server infrastructure customer technology technical
Topic #2:
project engineering design equipment process manufacturing system required material construction
Topic #3:
machine_learning model experience algorithm science statistic statistical technique research python
Topic #4:
product design user customer experience team feature technical research designer
Topic #5:
sale marketing customer market client account strategy content campaign channel
Topic #6:
preferred year required bachelor experience united_state_required united_state must skill equivalent
Topic #7:
experience software design development year technology system java engineering architecture
Topic #8:
test testing automation qa automated system requirement test_case software development
Topic #9:
company employee paid benefit medical_dental_vision flexible insurance oppor