In [7]:
import numpy as np

import nltk
import pandas as pd
import gensim

from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [2]:
df = pd.read_pickle('Data/clean_seattle')

In [3]:
len(df)

2908

## Unigrams

## N-grams

### Combine documents and split into one list of words

In [8]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))


In [10]:
clean_sents = list(sent_to_words(df.listed_items))
all_words = [item for sublist in clean_sents for item in sublist]

### Create ngrams for corpus

#### Bigrams

In [63]:
bigrams = ['_'.join(x) for x in zip(all_words, all_words[1:])]
len(bigrams)

457518

In [71]:
top_bigrams = pd.Series(bigrams).value_counts()[:100].index
list(top_bigrams[:5])

['year_experi',
 'comput_scienc',
 'machin_learn',
 'bachelor_degre',
 'communic_skill']

#### Trigrams

In [62]:
trigrams = ['_'.join(x) for x in zip(all_words, all_words[1:], all_words[2:])]
len(trigrams)

457517

Looks like medic_dental_vision appears a lot... I'll make a note to get rid of that one.

In [70]:
top_trigrams = pd.Series(trigrams).value_counts()[:50].index
list(top_trigrams[:5])

['degre_comput_scienc',
 'comput_scienc_relat',
 'bachelor_degre_comput',
 'written_communic_skill',
 'scienc_relat_field']

#### Quadgrams

In [72]:
quadgrams = ['_'.join(x) for x in zip(all_words, all_words[1:], all_words[2:], all_words[3:])]
len(quadgrams)

457516

Think about finding a way to handle the fact that these and similar items are treated as three separate items:
* verbal_written_communic_skill        203
* written_verbal_communic_skill        194
* excel_verbal_written_communic        121

In [78]:
top_quadgrams = pd.Series(quadgrams).value_counts().index[:20]
list(top_quadgrams)[:5]

['bachelor_degre_comput_scienc',
 'comput_scienc_relat_field',
 'degre_comput_scienc_relat',
 'verbal_written_communic_skill',
 'written_verbal_communic_skill']

### Remove redundant bigrams, trigrams, and quadgrams and combine to create a single list

In [92]:
non_redundant_bigrams = []
trigrams_string = ' '.join(top_trigrams)

for bigram in top_bigrams:
    if bigram in trigrams_string:
        pass
    else:
        non_redundant_bigrams.append(bigram)
print('There are ', len(non_redundant_bigrams), 'non-redundant bigrams')
non_redundant_bigrams[:5]

There are  56 non redundant bigrams


['machin_learn',
 'best_practic',
 'project_manag',
 'product_manag',
 'experi_build']

In [93]:
non_redundant_trigrams = []
quadgrams_string = ' '.join(top_quadgrams)

for trigram in top_trigrams:
    if trigram in quadgrams_string:
        pass
    else:
        non_redundant_trigrams.append(trigram)
print('There are ', len(non_redundant_trigrams), 'non-redundant bigrams')
non_redundant_trigrams[:5]

There are  30 non-redundant bigrams


['problem_solv_skill',
 'fast_pace_environ',
 'year_relev_experi',
 'communic_skill_abil',
 'comput_scienc_fundament']

In [96]:
top_ngrams = ' '.join([' '.join(non_redundant_bigrams), ' '.join(non_redundant_trigrams), ' '.join(top_quadgrams)])

### Convert documents to ngrams

In [97]:
def to_ngram(text):
    
    out = []
    
    for i, j in zip(text.split(), text.split()[1:]):
        bigram = '_'.join([i, j])
        out.append(bigram)
        
    for i, j, k in zip(text.split(), text.split()[1:], text.split()[2:]):
        trigram = '_'.join([i, j, k])
        out.append(trigram)
        
    for i, j, k, l in zip(text.split(), text.split()[1:],
                          text.split()[2:], text.split()[3:]):
        quadgram = '_'.join([i, j, k, l])
        out.append(quadgram) 
        
    return out        

In [101]:
ngram_df = pd.DataFrame(df.listed_items.apply(to_ngram))
ngram_df.head()

Unnamed: 0,listed_items
0,"[develop_high, high_scalabl, scalabl_classifi,..."
1,"[play_pivot, pivot_role, role_modern, modern_s..."
2,"[act_contribut, contribut_ux, ux_design, desig..."
3,"[manag_6, 6_8, 8_technic, technic_product, pro..."
4,"[lead_grow, grow_appli, appli_scientist, scien..."


In [None]:
def check_matching_ngrams(doc_ngrams):
    

1. create bigrams for corpus  
* create bigrams for individual documents  
* get value counts (using numpy???) of bigram appearances
* check which bigrams appear most in the entire corpus - set arbitrary cutoff for number of bigrams based on quality
* Remove redundant bigrams, trigrams, and quadgrams
* check which of those selected bigrams are in each document
    * to do this, find the intersection of a set for each document

In [19]:
trigram = gensim.models.phrases.Phrases(bigram[all_words], min_count=50)



In [21]:
bigram_model = gensim.models.phrases.Phraser(bigram)
trigram_model = gensim.models.phrases.Phraser(trigram)

In [22]:
for s in clean_sents[0:5]:
    print(f'{" ".join(trigram_model[bigram_model[s]]) } \n')

develop high scalabl classifi tool leverag machin learn regress rule base model suggest collect synthes requir creat effect featur roadmap deliver tandem engin adapt standard machin learn method best exploit modern parallel environ distribut cluster multicor smp gpu ms degre comput scienc relat quantit field ph degre comput scienc relat quantit field year experi one follow area machin learn recommend system pattern recognit mine artifici intellig proven abil translat insight busi recommend experi hadoop hbase pig mapreduc sawzal bigtabl knowledg develop debug java experi script languag perl python php shell script experi filesystem server architectur distribut system 

play pivot role modern standard enhanc peoplesoft secur util modern industri best practic provid secur strategi guidelin develop lead strategi protect softwar secur issu infiltr attempt social engin threat drive secur busi process system reengin effort strong background softwar develop inform secur busi analysi experi la

## Trigrams

In [16]:
cv = CountVectorizer(max_df=.95,min_df=2)
cv_array = cv.fit_transform(df.listed_items).toarray()
cv_df = pd.DataFrame(cv_array,columns=cv.get_feature_names())
cv_df.head()

Unnamed: 0,00,000,00am,00pm,08,09,10,100,1000,100k,...,zero,zestim,zillow,zip,zipwhip,zone,zoom,zulili,zunivers,ﬁling
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,2,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,2,0,0,0,0,0,0,0


In [17]:
nmf_model = NMF(n_components=10, random_state=42)
nmf = nmf_model.fit_transform(cv_df)

In [18]:
W = nmf
H = nmf_model.components_

The W factor contains the document membership weights relative to each of the k topics. Each row corresponds to a single document, and each column correspond to a topic.

In [19]:
W.shape

(2908, 10)

The H factor contains the term weights relative to each of the k topics. In this case, each row corresponds to a topic, and each column corresponds to a unique term in the corpus vocabulary.

In [20]:
H.shape

(10, 6091)

In [21]:
top_indices = np.argsort(H[1,:])[::-1]

In [22]:
def get_descriptor(terms, H, topic_index, top):
    #reverse sort the values to sort the indices
    top_indices = np.argsort(H[topic_index,:])[::-1]
    top_terms = []
    for term_index in top_indices[0:top]:
        top_terms.append(terms[term_index])
    return top_terms

In [23]:
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        print(", ".join([feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))
    print()

In [24]:
print_top_words(nmf_model,cv.get_feature_names(),10)

Topic #0:
experi, year, tool, strong, etc, develop, environ, degre, knowledg, analysi
Topic #1:
busi, analyt, solut, develop, analysi, support, requir, partner, process, model
Topic #2:
system, support, oper, servic, manag, network, secur, technic, technolog, perform
Topic #3:
product, custom, market, manag, develop, engin, experi, sale, team, technic
Topic #4:
research, analysi, analyt, quantit, energi, design, complex, result, help, experi
Topic #5:
requir, may, document, posit, includ, educ, experi, applic, must, inform
Topic #6:
project, manag, develop, program, plan, process, report, resourc, includ, coordin
Topic #7:
design, test, develop, engin, softwar, system, product, requir, tool, process
Topic #8:
abil, skill, communic, strong, excel, demonstr, technic, written, effect, verbal
Topic #9:
learn, comput, scienc, machin, engin, model, softwar, build, develop, statist

