In [6]:
import numpy as np
import pandas as pd

import gensim

from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Load the cleaned dataset produced upstream.
# NOTE: unpickling can execute arbitrary code; only load pickle files from a trusted source.
df = pd.read_pickle('../Data/00_clean_data')

In [3]:
# Number of rows in the loaded frame.
len(df)

11643

## N-grams process flow

1. Create ngrams for corpus
2. Create ngrams for individual documents
3. Get value counts of ngram appearances
4. Check which ngrams appear most in the entire corpus - set arbitrary cutoff for number of bigrams based on quality
5. Remove redundant bigrams, trigrams, and quadgrams
6. Check which of those selected ngrams are in each document
7. Combine with existing documents

### Combine documents and split into one list of words

In [4]:
def sent_to_words(sentences):
    """Lazily tokenize an iterable of texts.

    Each item is converted with ``str()`` and passed to
    ``gensim.utils.simple_preprocess`` with ``deacc=True``; the resulting
    token list is yielded, one list per input item.
    """
    yield from (
        gensim.utils.simple_preprocess(str(raw_text), deacc=True)
        for raw_text in sentences
    )


In [7]:
# One token list per document.
clean_sents = [doc_tokens for doc_tokens in sent_to_words(df.listed_items)]
# Flatten the per-document token lists into a single corpus-wide word list.
all_words = [
    token
    for doc_tokens in clean_sents
    for token in doc_tokens
]

### Create ngrams from corpus

#### Bigrams

In [29]:
# Pair every word with its successor in the corpus word list.
# NOTE: all_words is a single flat list, so some pairs straddle the boundary
# between two consecutive documents.
bigrams = [
    first + '_' + second
    for first, second in zip(all_words, all_words[1:])
]
len(bigrams)

2023512

In [30]:
# Keep the 210 most frequent bigrams.
# For reference: the bigram at rank 210 had a value count of 331.
top_bigrams = pd.Series(bigrams).value_counts().iloc[:210].index
top_bigrams[:5].tolist()

['year_experience',
 'communication_skill',
 'computer_science',
 'bachelor_degree',
 'machine_learning']

#### Trigrams

In [31]:
# Every run of three consecutive words in the corpus word list, joined by '_'.
trigrams = [
    '_'.join((first, second, third))
    for first, second, third in zip(all_words, all_words[1:], all_words[2:])
]
len(trigrams)

2023511

Looks like medic_dental_vision appears a lot... I'll make a note to get rid of that one.

In [39]:
# Keep the 100 most frequent trigrams.
# For reference: the trigram at rank 100 had a value count of 169.
top_trigrams = pd.Series(trigrams).value_counts().head(100).index
top_trigrams[:5].tolist()

['degree_computer_science',
 'written_communication_skill',
 'verbal_written_communication',
 'computer_science_related',
 'written_verbal_communication']

#### Quadgrams

In [40]:
# Slide a four-word window over the corpus word list and join each window by '_'.
quadgrams = [
    '_'.join(all_words[start:start + 4])
    for start in range(len(all_words) - 3)
]
len(quadgrams)

2023510

Think about finding a way to handle the fact that these and similar items are treated as three separate items:
* verbal_written_communic_skill        203
* written_verbal_communic_skill        194
* excel_verbal_written_communic        121

In [47]:
# Keep the 32 most frequent quadgrams.
# NOTE(review): the earlier note recorded a cutoff value count of 124 but
# attributed it to the "100th" quadgram; with a 32-item cutoff it presumably
# belongs to the 32nd - confirm.
top_quadgrams = pd.Series(quadgrams).value_counts().head(32).index
top_quadgrams[:5].tolist()

['bachelor_degree_computer_science',
 'written_verbal_communication_skill',
 'verbal_written_communication_skill',
 'computer_science_related_field',
 'degree_computer_science_related']

### Remove redundant bigrams, trigrams, and quadgrams and combine to create a single list

In [48]:
# Drop every top bigram that already occurs inside one of the top trigrams.
non_redundant_bigrams = []
trigrams_string = ' '.join(top_trigrams)
# Wrap each trigram in underscores ("_a_b_c_ _d_e_f_") so that a bigram only
# matches on whole-word boundaries. A plain substring test would wrongly treat
# e.g. "age_group" as part of "manage_group_x".
delimited_trigrams = '_' + trigrams_string.replace(' ', '_ _') + '_'

for bigram in top_bigrams:
    if '_' + bigram + '_' not in delimited_trigrams:
        non_redundant_bigrams.append(bigram)
print('There are', len(non_redundant_bigrams), 'non-redundant bigrams')
non_redundant_bigrams[:5]

There are 122 non-redundant bigrams


['best_practice',
 'working_knowledge',
 'experience_building',
 'large_scale',
 'demonstrated_ability']

In [49]:
# Drop every top trigram that already occurs inside one of the top quadgrams.
non_redundant_trigrams = []
quadgrams_string = ' '.join(top_quadgrams)
# Wrap each quadgram in underscores ("_a_b_c_d_ _e_f_g_h_") so that a trigram
# only matches on whole-word boundaries. A plain substring test would also
# match a trigram whose first or last word is merely a fragment of a longer word.
delimited_quadgrams = '_' + quadgrams_string.replace(' ', '_ _') + '_'

for trigram in top_trigrams:
    if '_' + trigram + '_' not in delimited_quadgrams:
        non_redundant_trigrams.append(trigram)
print('There are', len(non_redundant_trigrams), 'non-redundant trigrams')
non_redundant_trigrams[:5]

There are 67 non-redundant trigrams


['minimum_year_experience',
 'fast_paced_environment',
 'cross_functional_team',
 'year_relevant_experience',
 'excellent_communication_skill']

In [50]:
# Single space-separated string holding the selected bigrams, trigrams and
# quadgrams, in that order.
top_ngrams = ' '.join(map(' '.join, (non_redundant_bigrams,
                                     non_redundant_trigrams,
                                     top_quadgrams)))

### Convert documents to ngrams

In [51]:
def to_ngram(text):
    """Return all bigrams, trigrams and quadgrams of a whitespace-separated text.

    Parameters
    ----------
    text : str
        Document text; it is tokenized with ``str.split()``.

    Returns
    -------
    list of str
        All bigrams first, then all trigrams, then all quadgrams, each in order
        of appearance, with the words of an n-gram joined by ``'_'``. Texts with
        fewer than two words yield an empty list.
    """
    # Tokenize once instead of re-splitting the text for every n-gram size.
    tokens = text.split()

    out = []
    for size in (2, 3, 4):
        # A negative upper bound gives an empty range, so short texts simply
        # contribute nothing for this n-gram size.
        for start in range(len(tokens) - size + 1):
            out.append('_'.join(tokens[start:start + size]))

    return out

In [52]:
# Replace each document's text with the list of its bigrams, trigrams and quadgrams.
ngram_df = df['listed_items'].apply(to_ngram).to_frame()
ngram_df.head()

Unnamed: 0,listed_items
0,"[develop_analyze, analyze_2d, 2d_3d, 3d_cellul..."
2,"[write_develop, develop_analyze, analyze_eleva..."
3,"[provide_high, high_quality, quality_support, ..."
5,"[structure_solution, solution_complex, complex..."
7,"[translate_business, business_requirement, req..."


### Find matching ngrams

Need to edit this function:  
if ngram == any ngram in top ngrams

Currently the 'in' will include any bigrams that are within a trigram, effectively nullifying the result of the *Remove redundant bigrams, trigrams, and quadgrams and combine to create a single list* step.

In [53]:
def check_matching_ngrams(doc_ngrams, vocabulary=None):
    """Keep only the n-grams of a document that are in the selected vocabulary.

    Parameters
    ----------
    doc_ngrams : iterable of str
        N-grams of one document.
    vocabulary : str or iterable of str, optional
        Allowed n-grams, either as a whitespace-separated string or as a
        collection. Defaults to the notebook-level ``top_ngrams`` string.

    Returns
    -------
    str
        The matching n-grams joined by single spaces, in document order; an
        n-gram that occurs several times in the document is kept each time.
        Matching is by exact equality, not by substring.
    """
    if vocabulary is None:
        vocabulary = top_ngrams
    if isinstance(vocabulary, str):
        vocabulary = vocabulary.split()

    # Set membership replaces a scan of the whole vocabulary for every n-gram.
    allowed = set(vocabulary)

    return ' '.join(ngram for ngram in doc_ngrams if ngram in allowed)

In [54]:
# For every document, keep only the n-grams that are among the selected top n-grams.
matching_ngram_df = ngram_df['listed_items'].apply(check_matching_ngrams).to_frame()
matching_ngram_df.head()

Unnamed: 0,listed_items
0,plus_experience plus_experience demonstrated_a...
2,user_experience learn_new solve_problem design...
3,high_quality customer_service customer_service...
5,business_problem business_problem advanced_deg...
7,business_requirement develop_maintain relation...


### Combine ngrams with document text

In [55]:
# Append each document's matched n-grams to its text. The explicit space keeps
# the last word of the document from fusing with the first n-gram into a
# single token.
# NOTE: this overwrites df['listed_items'], so re-running the cell appends the
# n-grams a second time.
df['listed_items'] = df.listed_items + ' ' + matching_ngram_df.listed_items

In [57]:
# Persist the frame, whose listed_items column now includes the matched n-grams.
df.to_pickle('../Data/00_clean_custom_ngram')