In [1]:
import numpy as np

import nltk
import pandas as pd
import gensim

from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [2]:
df = pd.read_pickle('Data/clean_seattle')

In [3]:
len(df)

2908

## N-grams process flow

1. create ngrams for corpus  
* create ngrams for individual documents  
* get value counts of ngram appearances
* check which ngrams appear most in the entire corpus - set arbitrary cutoff for number of bigrams based on quality
* Remove redundant bigrams, trigrams, and quadgrams
* check which of those selected ngrams are in each document
* combine with existing documents

### Combine documents and split into one list of words

In [4]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))


In [5]:
clean_sents = list(sent_to_words(df.listed_items))
all_words = [item for sublist in clean_sents for item in sublist]

### Create ngrams from corpus

#### Bigrams

In [6]:
bigrams = ['_'.join(x) for x in zip(all_words, all_words[1:])]
len(bigrams)

457518

In [7]:
top_bigrams = pd.Series(bigrams).value_counts()[:100].index
list(top_bigrams[:5])

['year_experi',
 'comput_scienc',
 'machin_learn',
 'bachelor_degre',
 'communic_skill']

#### Trigrams

In [8]:
trigrams = ['_'.join(x) for x in zip(all_words, all_words[1:], all_words[2:])]
len(trigrams)

457517

Looks like medic_dental_vision appears a lot... I'll make a note to get rid of that one.

In [9]:
top_trigrams = pd.Series(trigrams).value_counts()[:50].index
list(top_trigrams[:5])

['degre_comput_scienc',
 'comput_scienc_relat',
 'bachelor_degre_comput',
 'written_communic_skill',
 'scienc_relat_field']

#### Quadgrams

In [10]:
quadgrams = ['_'.join(x) for x in zip(all_words, all_words[1:], all_words[2:], all_words[3:])]
len(quadgrams)

457516

Think about finding a way to handle the fact that these and similar items are treated as three separate items:
* verbal_written_communic_skill        203
* written_verbal_communic_skill        194
* excel_verbal_written_communic        121

In [11]:
top_quadgrams = pd.Series(quadgrams).value_counts().index[:20]
list(top_quadgrams)[:5]

['bachelor_degre_comput_scienc',
 'comput_scienc_relat_field',
 'degre_comput_scienc_relat',
 'verbal_written_communic_skill',
 'written_verbal_communic_skill']

### Remove redundant bigrams, trigrams, and quadgrams and combine to create a single list

In [12]:
non_redundant_bigrams = []
trigrams_string = ' '.join(top_trigrams)

for bigram in top_bigrams:
    if bigram in trigrams_string:
        pass
    else:
        non_redundant_bigrams.append(bigram)
print('There are', len(non_redundant_bigrams), 'non-redundant bigrams')
non_redundant_bigrams[:5]

There are 56 non-redundant bigrams


['machin_learn',
 'best_practic',
 'project_manag',
 'product_manag',
 'experi_build']

In [13]:
non_redundant_trigrams = []
quadgrams_string = ' '.join(top_quadgrams)

for trigram in top_trigrams:
    if trigram in quadgrams_string:
        pass
    else:
        non_redundant_trigrams.append(trigram)
print('There are', len(non_redundant_trigrams), 'non-redundant trigrams')
non_redundant_trigrams[:5]

There are 30 non-redundant trigrams


['problem_solv_skill',
 'fast_pace_environ',
 'year_relev_experi',
 'communic_skill_abil',
 'comput_scienc_fundament']

In [14]:
top_ngrams = ' '.join([' '.join(non_redundant_bigrams),
                       ' '.join(non_redundant_trigrams),
                       ' '.join(top_quadgrams)])

### Convert documents to ngrams

In [15]:
def to_ngram(text):
    
    out = []
    
    for i, j in zip(text.split(), text.split()[1:]):
        bigram = '_'.join([i, j])
        out.append(bigram)
        
    for i, j, k in zip(text.split(), text.split()[1:], text.split()[2:]):
        trigram = '_'.join([i, j, k])
        out.append(trigram)
        
    for i, j, k, l in zip(text.split(), text.split()[1:],
                          text.split()[2:], text.split()[3:]):
        quadgram = '_'.join([i, j, k, l])
        out.append(quadgram) 
        
    return out        

In [16]:
ngram_df = pd.DataFrame(df.listed_items.apply(to_ngram))
ngram_df.head()

Unnamed: 0,listed_items
0,"[develop_high, high_scalabl, scalabl_classifi,..."
1,"[play_pivot, pivot_role, role_modern, modern_s..."
2,"[act_contribut, contribut_ux, ux_design, desig..."
3,"[manag_6, 6_8, 8_technic, technic_product, pro..."
4,"[lead_grow, grow_appli, appli_scientist, scien..."


### Find matching ngrams

Need to edit this function:  
if ngram == any ngram in top ngrams

Currenty the 'in' will include any bigrams that are within a trigram effectively nullifying the result of the *Remove redundant bigrams, trigrams, and quadgrams and combine to create a single list* step.

In [19]:
def check_matching_ngrams(doc_ngrams):
    
    matching_ngrams = []
    
    top_ngrams_list = top_ngrams.split()
    
    for doc_ngram in doc_ngrams:
        for top_ngram in top_ngrams_list:
            if doc_ngram == top_ngram:
                matching_ngrams.append(doc_ngram)
            else:
                pass
    return ' '.join(matching_ngrams)

In [20]:
matching_ngram_df = pd.DataFrame(ngram_df.listed_items.apply(check_matching_ngrams))
matching_ngram_df.head()

Unnamed: 0,listed_items
0,machin_learn machin_learn quantit_field quanti...
1,best_practic demonstr_abil best_practic best_p...
2,busi_requir design_develop design_implement cu...
3,product_manag product_manag busi_requir experi...
4,machin_learn long_term softwar_engin engin_tea...


### Combine ngrams with document text

In [21]:
df['listed_items'] = df.listed_items + matching_ngram_df.listed_items

## Testing TF-IDF NMF with new ngrams

In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [23]:
tf_idf = TfidfVectorizer(max_df=0.95)
tf_idf_array = tf_idf.fit_transform(df.listed_items).toarray()
tf_idf_df = pd.DataFrame(tf_idf_array,columns=tf_idf.get_feature_names())
tf_idf_df.shape

(2908, 10016)

In [24]:
nmf_model = NMF(n_components=10, random_state=42)
nmf = nmf_model.fit_transform(tf_idf_df)

In [25]:
W = nmf
H = nmf_model.components_

The W factor contains the document membership weights relative to each of the k topics. Each row corresponds to a single document, and each column correspond to a topic.

In [26]:
W.shape

(2908, 10)

The H factor contains the term weights relative to each of the k topics. In this case, each row corresponds to a topic, and each column corresponds to a unique term in the corpus vocabulary.

In [27]:
H.shape

(10, 10016)

In [28]:
top_indices = np.argsort(H[1,:])[::-1]

In [29]:
def get_descriptor(terms, H, topic_index, top):
    #reverse sort the values to sort the indices
    top_indices = np.argsort(H[topic_index,:])[::-1]
    top_terms = []
    for term_index in top_indices[0:top]:
        top_terms.append(terms[term_index])
    return top_terms

In [30]:
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))
    print()

In [32]:
print_top_words(nmf_model,tf_idf.get_feature_names(),10)

Topic #0:
experi system softwar servic cloud technolog develop build engin scale
Topic #1:
comput_scienc_fundament comput scienc fundament degre_comput_scienc_relat object_orient_design object orient bachelor_degre_comput_scienc algorithm
Topic #2:
learn machin machin_learn model experi statist comput languag phd algorithm
Topic #3:
market custom product sale content strategi execut busi partner develop
Topic #4:
requir support project document process system assist provid manag maintain
Topic #5:
busi analyt statist analysi sql model report experi quantit insight
Topic #6:
test autom develop experi qa qualiti softwar framework methodolog script
Topic #7:
paid compani custom benefit employe user time dental dog help
Topic #8:
manag project abil busi technic skill team communic experi function
Topic #9:
design product experi engin manufactur develop research prototyp user softwar

