In [1]:
import numpy as np
import pandas as pd

import gensim
from sklearn.feature_extraction.text import TfidfVectorizer

import pickle

In [2]:
# Load the cleaned data set; the rest of the notebook reads its `listed_items` column.
# NOTE(review): unpickling can execute arbitrary code, so this file must come
# from a trusted source (presumably an earlier step of this project - confirm).
df = pd.read_pickle('../Data/01_clean_sf')

In [3]:
# Row count of the loaded frame.
df.shape[0]

3364

## N-grams process flow
_(For creating custom ngram tools)_

1. Create ngrams for the corpus
2. Create ngrams for individual documents
3. Get value counts of ngram appearances
4. Check which ngrams appear most in the entire corpus - set an arbitrary cutoff for the number of ngrams based on quality
5. Remove redundant bigrams, trigrams, and quadgrams
6. Check which of those selected ngrams are in each document
7. Combine with existing documents

### Combine documents and split into one list of words

In [4]:
def sent_to_words(sentences):
    """Lazily tokenize each item of ``sentences``.

    Every item is converted to ``str`` and passed through
    ``gensim.utils.simple_preprocess`` with ``deacc=True``; one token list is
    yielded per input item, in input order.
    """
    for raw_sentence in sentences:
        as_text = str(raw_sentence)
        yield gensim.utils.simple_preprocess(as_text, deacc=True)


In [5]:
# Tokenize every document, then flatten the per-document token lists into a
# single corpus-wide word list (document boundaries are not kept in all_words).
clean_sents = [doc_tokens for doc_tokens in sent_to_words(df.listed_items)]
all_words = []
for doc_tokens in clean_sents:
    all_words.extend(doc_tokens)

### Create ngrams from corpus

#### Bigrams

In [6]:
# Pair every word with its successor to build the corpus-wide bigram list.
# NOTE(review): all_words is one flat list, so pairs that straddle two
# consecutive documents are produced as well.
bigrams = [first + '_' + second for first, second in zip(all_words, all_words[1:])]
len(bigrams)

445439

In [7]:
# Keep the 100 most frequent bigrams (value_counts sorts by descending count).
bigram_counts = pd.Series(bigrams).value_counts()
top_bigrams = bigram_counts.head(100).index
top_bigrams[:5].tolist()

['machine_learning',
 'cross_functional',
 'related_field',
 'problem_solving',
 'attention_detail']

#### Trigrams

In [8]:
# Every run of three consecutive corpus words, joined with underscores.
trigrams = [
    first + '_' + second + '_' + third
    for first, second, third in zip(all_words, all_words[1:], all_words[2:])
]
len(trigrams)

445438

In [9]:
# Preview the trigrams ranked 2nd through 25th by frequency.
# NOTE(review): the slice starts at 1, so the single most frequent trigram is
# left out - confirm this is intentional.
trigram_counts = pd.Series(trigrams).value_counts()
trigram_counts.index[1:25].tolist()

['cross_functional_team',
 'verbal_written_communication',
 'proven_track_record',
 'machine_learning_model',
 'oral_written_communication',
 'analytical_problem_solving',
 'subject_matter_expert',
 'written_oral_communication',
 'machine_learning_algorithm',
 'programming_language_python',
 'related_technical_field',
 'machine_learning_technique',
 'natural_language_processing',
 'strong_problem_solving',
 'stakeholder_cell_product',
 'cell_product_city',
 'strong_attention_detail',
 'united_state_required',
 'strong_written_verbal',
 'bachelor_related_field',
 'strong_verbal_written',
 'ad_hoc_analysis',
 'product_manager_engineer',
 'cross_functional_partner']

In [10]:
# Select the trigrams ranked 2nd through 25th by frequency.
# NOTE(review): the slice starts at 1, so the single most frequent trigram is
# left out - confirm this is intentional.
top_trigrams = pd.Series(trigrams).value_counts().index[1:25]
top_trigrams[:5].tolist()

['cross_functional_team',
 'verbal_written_communication',
 'proven_track_record',
 'machine_learning_model',
 'oral_written_communication']

#### Quadgrams

In [11]:
# Every run of four consecutive corpus words, joined with underscores.
quadgrams = [
    '_'.join(all_words[start:start + 4])
    for start in range(len(all_words) - 3)
]
len(quadgrams)

445437

In [12]:
# Preview the ten most frequent quadgrams.
quadgram_counts = pd.Series(quadgrams).value_counts()
quadgram_counts.index[:10].tolist()

['stakeholder_cell_product_city',
 'strong_written_verbal_communication',
 'strong_verbal_written_communication',
 'strong_analytical_problem_solving',
 'machine_learning_artificial_intelligence',
 'machine_learning_deep_learning',
 'required_united_state_required',
 'programming_language_python_java',
 'strong_written_oral_communication',
 'strong_oral_written_communication']

In [13]:
# Keep the ten most frequent quadgrams as a plain list.
top_quadgrams = pd.Series(quadgrams).value_counts().index[:10].tolist()
top_quadgrams[:5]

['stakeholder_cell_product_city',
 'strong_written_verbal_communication',
 'strong_verbal_written_communication',
 'strong_analytical_problem_solving',
 'machine_learning_artificial_intelligence']

### Remove redundant bigrams, trigrams, and quadgrams and combine to create a single list

In [14]:
# A bigram is redundant when it occurs inside one of the selected trigrams.
# Both the bigram and every trigram are wrapped in '_' so that a match has to
# start and end on a word boundary; a plain substring test would also treat
# e.g. 'end_user' as part of 'backend_user_x'. The padded pattern contains no
# space, so it can never match across two neighbouring trigrams.
trigrams_string = ' '.join('_' + trigram + '_' for trigram in top_trigrams)

non_redundant_bigrams = [
    bigram for bigram in top_bigrams
    if '_' + bigram + '_' not in trigrams_string
]
print('There are', len(non_redundant_bigrams), 'non-redundant bigrams')
non_redundant_bigrams[:5]

There are 71 non-redundant bigrams


['product_management',
 'project_management',
 'product_development',
 'large_scale',
 'internal_external']

In [15]:
# A trigram is redundant when it occurs inside one of the selected quadgrams.
# Both the trigram and every quadgram are wrapped in '_' so that a match has
# to start and end on a word boundary; a plain substring test would also
# accept a trigram whose first or last word is only a fragment of a quadgram
# word. The padded pattern contains no space, so it can never match across
# two neighbouring quadgrams.
quadgrams_string = ' '.join('_' + quadgram + '_' for quadgram in top_quadgrams)

non_redundant_trigrams = [
    trigram for trigram in top_trigrams
    if '_' + trigram + '_' not in quadgrams_string
]
print('There are', len(non_redundant_trigrams), 'non-redundant trigrams')
non_redundant_trigrams[:5]

There are 14 non-redundant trigrams


['cross_functional_team',
 'proven_track_record',
 'machine_learning_model',
 'subject_matter_expert',
 'machine_learning_algorithm']

In [16]:
# Merge the three selections into one space-separated vocabulary string:
# bigrams first, then trigrams, then quadgrams.
ngram_groups = (non_redundant_bigrams, non_redundant_trigrams, top_quadgrams)
top_ngrams = ' '.join(' '.join(group) for group in ngram_groups)

In [17]:
# Persist the selected n-gram vocabulary (a single space-separated string).
# The context manager closes the file even if pickle.dump raises.
with open("../Tools_and_models/top_ngrams", "wb") as pickle_out:
    pickle.dump(top_ngrams, pickle_out)

### Convert documents to ngrams

In [18]:
def to_ngram(text, ngram_sizes=(2, 3, 4)):
    """Build the underscore-joined word n-grams of a document.

    The text is split on whitespace once; for each size in ``ngram_sizes``
    every run of that many consecutive words is joined with ``'_'``.

    Parameters
    ----------
    text : str
        Whitespace-separated document text.
    ngram_sizes : iterable of int, optional
        N-gram lengths to generate, in output order. Defaults to
        ``(2, 3, 4)``: bigrams, then trigrams, then quadgrams.

    Returns
    -------
    list of str
        All n-grams of the first size in document order, followed by those of
        the next size, and so on. A size larger than the number of words
        contributes nothing, so an empty or one-word text yields ``[]``.

    Raises
    ------
    ValueError
        If a requested size is smaller than 1.
    """
    words = text.split()  # split once instead of once per zip argument

    out = []
    for size in ngram_sizes:
        if size < 1:
            raise ValueError('ngram sizes must be >= 1, got %r' % (size,))
        # range() is empty when the text has fewer than `size` words
        for start in range(len(words) - size + 1):
            out.append('_'.join(words[start:start + size]))

    return out

In [19]:
# Replace each document's text by the list of n-grams that to_ngram builds from it.
doc_ngram_lists = df['listed_items'].apply(to_ngram)
ngram_df = doc_ngram_lists.to_frame()
ngram_df.head()

Unnamed: 0,listed_items
0,"[write_build, build_product, product_according..."
1,"[closely_product, product_assist, assist_inves..."
2,"[proficiency_gi, gi_e, e_g, g_arcgis, arcgis_e..."
3,"[effectively_interpret, interpret_client, clie..."
4,"[maintain_existing, existing_implement, implem..."


### Find matching ngrams

In [20]:
def check_matching_ngrams(doc_ngrams, selected_ngrams=None):
    """Keep only the document n-grams that belong to the selected vocabulary.

    Parameters
    ----------
    doc_ngrams : iterable of str
        N-grams of one document (e.g. the output of ``to_ngram``).
    selected_ngrams : iterable of str, optional
        Vocabulary to match against. When omitted, the module-level
        ``top_ngrams`` string is split on whitespace and used.

    Returns
    -------
    str
        The matching n-grams joined by single spaces, in document order and
        with repeats within the document preserved; ``''`` when nothing
        matches. An entry repeated in the vocabulary counts once.
    """
    if selected_ngrams is None:
        selected_ngrams = top_ngrams.split()

    # Set membership replaces a scan of the whole vocabulary for every
    # document n-gram.
    vocabulary = set(selected_ngrams)

    return ' '.join(ngram for ngram in doc_ngrams if ngram in vocabulary)

Note that some rows of _listed_items_ below are empty. This is not because those documents have no listed items, but because none of their ngrams matched the selected top ngrams.

In [21]:
# Reduce each document's n-gram list to the space-joined n-grams that are in
# the selected vocabulary.
matched_text = ngram_df['listed_items'].apply(check_matching_ngrams)
matching_ngram_df = matched_text.to_frame()
matching_ngram_df.head()

Unnamed: 0,listed_items
0,product_management business_problem business_p...
1,technical_technical business_problem self_star...
2,
3,
4,design_develop strong_knowledge strong_knowled...


### Combine ngrams with document text

In [22]:
# Append each document's matched n-grams to its text. The explicit ' '
# separator keeps the last word of the document from fusing with the first
# n-gram; rstrip() drops the trailing space left when a document has no
# matching n-grams.
df['listed_items'] = (
    df.listed_items + ' ' + matching_ngram_df.listed_items
).str.rstrip()

In [23]:
# Renumber the rows 0..n-1 and discard the previous index.
df = df.reset_index(drop=True)

In [24]:
# Save the frame, whose listed_items column now also carries the matched n-grams.
df.to_pickle('../Data/01_clean_sf_custom_ngram')