In [6]:
import numpy as np
import pandas as pd

import gensim

from sklearn.feature_extraction.text import TfidfVectorizer

In [7]:
# Load the pickled DataFrame this notebook works on; its `listed_items`
# column is the text used by every later cell.
# NOTE(review): unpickling can execute arbitrary code, so only load files
# from a trusted source.
df = pd.read_pickle('../Data/01_clean_sf')

In [8]:
# Number of rows in the loaded DataFrame.
df.shape[0]

3416

## N-grams process flow

1. Create n-grams for the corpus
2. Create n-grams for individual documents
3. Get value counts of n-gram appearances
4. Check which n-grams appear most in the entire corpus, and set an arbitrary cutoff for the number of n-grams kept based on quality
5. Remove redundant bigrams, trigrams, and quadgrams
6. Check which of the selected n-grams are in each document
7. Combine with the existing documents

### Combine documents and split into one list of words

In [9]:
def sent_to_words(sentences):
    """Lazily tokenize each item of ``sentences``.

    Every item is coerced to ``str`` and passed to
    ``gensim.utils.simple_preprocess`` with ``deacc=True`` (accent marks are
    stripped from the tokens).

    Args:
        sentences: iterable of documents/sentences.

    Yields:
        One list of tokens per input item, in input order.
    """
    yield from (
        gensim.utils.simple_preprocess(str(text), deacc=True)
        for text in sentences
    )


In [10]:
# Tokenize every entry of `listed_items`, then flatten the per-document token
# lists into a single corpus-wide list of words.
clean_sents = [*sent_to_words(df.listed_items)]
all_words = [word for doc_tokens in clean_sents for word in doc_tokens]

### Create ngrams from corpus

#### Bigrams

In [11]:
# Join every word with its successor in `all_words`. Pairs are formed
# wherever two words are adjacent in that list.
bigrams = list(map('_'.join, zip(all_words, all_words[1:])))
len(bigrams)

566965

In [16]:
# Keep the 100 most frequent bigrams (value_counts sorts by descending
# count); the cutoff of 100 is a hand-picked threshold.
top_bigrams = pd.Series(bigrams).value_counts().head(100).index
top_bigrams[:5].tolist()

['year_experience',
 'machine_learning',
 'communication_skill',
 'computer_science',
 'experience_working']

#### Trigrams

In [17]:
# Join every run of three consecutive words in `all_words` with underscores.
trigrams = list(map('_'.join, zip(all_words, all_words[1:], all_words[2:])))
len(trigrams)

566964

In [20]:
# Preview the trigrams ranked 2nd through 25th by frequency.
# NOTE(review): the slice starts at position 1, so the single most frequent
# trigram is skipped - presumably on purpose; confirm.
pd.Series(trigrams).value_counts().iloc[1:25].index.tolist()

['degree_computer_science',
 'written_communication_skill',
 'written_verbal_communication',
 'cross_functional_team',
 'verbal_communication_skill',
 'computer_science_related',
 'verbal_written_communication',
 'communication_skill_ability',
 'fast_paced_environment',
 'problem_solving_skill',
 'excellent_communication_skill',
 'excellent_written_verbal',
 'science_related_field',
 'year_relevant_experience',
 'minimum_year_experience',
 'bachelor_degree_computer',
 'proven_track_record',
 'dental_vision_benefit',
 'related_field_year',
 'year_industry_experience',
 'least_year_experience',
 'computer_science_engineering',
 'excellent_verbal_written',
 'technical_non_technical']

In [21]:
# Keep the trigrams ranked 2nd through 25th by frequency.
# NOTE(review): position 0 (the most frequent trigram) is excluded by the
# slice - presumably on purpose; confirm.
top_trigrams = pd.Series(trigrams).value_counts().iloc[1:25].index
top_trigrams[:5].tolist()

['degree_computer_science',
 'written_communication_skill',
 'written_verbal_communication',
 'cross_functional_team',
 'verbal_communication_skill']

#### Quadgrams

In [22]:
# Join every run of four consecutive words in `all_words` with underscores.
quadgrams = list(map(
    '_'.join,
    zip(all_words, all_words[1:], all_words[2:], all_words[3:]),
))
len(quadgrams)

566963

Think about finding a way to handle the fact that these and similar items are treated as three separate items:
* verbal_written_communic_skill        203
* written_verbal_communic_skill        194
* excel_verbal_written_communic        121

In [24]:
# Preview the ten most frequent quadgrams.
pd.Series(quadgrams).value_counts().index[:10].tolist()

['written_verbal_communication_skill',
 'verbal_written_communication_skill',
 'bachelor_degree_computer_science',
 'excellent_written_verbal_communication',
 'computer_science_related_field',
 'medical_dental_vision_benefit',
 'excellent_verbal_written_communication',
 'oral_written_communication_skill',
 'written_oral_communication_skill',
 'degree_computer_science_related']

In [30]:
# Take the ten most frequent quadgrams and drop the one hand-picked for
# exclusion. Filtering (instead of list.remove) keeps the cell from raising
# ValueError when that quadgram is not among the top ten, e.g. after the
# input data changes.
top_quadgrams = [
    quadgram
    for quadgram in pd.Series(quadgrams).value_counts().index[:10]
    if quadgram != 'medical_dental_vision_benefit'
]
top_quadgrams[:5]

['written_verbal_communication_skill',
 'verbal_written_communication_skill',
 'bachelor_degree_computer_science',
 'excellent_written_verbal_communication',
 'computer_science_related_field']

### Remove redundant bigrams, trigrams, and quadgrams and combine to create a single list

In [31]:
# A bigram is redundant when it occurs as two consecutive words inside one of
# the selected trigrams.
trigrams_string = ' '.join(top_trigrams)

# Wrap every trigram in underscores so a match has to start and end on a word
# boundary. A plain substring test would also flag e.g. 'work_experience' as
# part of 'network_experience_x'.
padded_trigrams = '_' + trigrams_string.replace(' ', '_ _') + '_'

non_redundant_bigrams = [
    bigram
    for bigram in top_bigrams
    if '_' + bigram + '_' not in padded_trigrams
]
print('There are', len(non_redundant_bigrams), 'non-redundant bigrams')
non_redundant_bigrams[:5]

There are 70 non-redundant bigrams


['machine_learning',
 'experience_working',
 'best_practice',
 'medical_dental',
 'experience_building']

In [32]:
# A trigram is redundant when it occurs as three consecutive words inside one
# of the selected quadgrams.
quadgrams_string = ' '.join(top_quadgrams)

# Wrap every quadgram in underscores so a match has to start and end on a
# word boundary; a plain substring test could also match in the middle of a
# longer word.
padded_quadgrams = '_' + quadgrams_string.replace(' ', '_ _') + '_'

non_redundant_trigrams = [
    trigram
    for trigram in top_trigrams
    if '_' + trigram + '_' not in padded_quadgrams
]
print('There are', len(non_redundant_trigrams), 'non-redundant trigrams')
non_redundant_trigrams[:5]

There are 14 non-redundant trigrams


['cross_functional_team',
 'communication_skill_ability',
 'fast_paced_environment',
 'problem_solving_skill',
 'excellent_communication_skill']

In [33]:
# Build one space-separated string holding the selected bigrams, trigrams and
# quadgrams (each group joined first, then the three groups joined together).
top_ngrams = ' '.join(map(
    ' '.join,
    (non_redundant_bigrams, non_redundant_trigrams, top_quadgrams),
))

### Convert documents to ngrams

In [34]:
def to_ngram(text, sizes=(2, 3, 4)):
    """Return the contiguous word n-grams of ``text`` as underscore-joined strings.

    The text is split on whitespace. For each size in ``sizes`` (in the given
    order) every run of that many consecutive words is joined with ``'_'``.
    With the default sizes the result lists all bigrams, then all trigrams,
    then all quadgrams.

    Args:
        text: document as a single string.
        sizes: iterable of n-gram lengths to generate. Defaults to
            ``(2, 3, 4)``, which reproduces the original behaviour.

    Returns:
        List of n-gram strings; empty when the text has fewer words than the
        smallest requested size.
    """
    # Split once; the word list is reused for every n-gram size.
    words = text.split()

    out = []
    for size in sizes:
        # Zipping `size` staggered views of the word list yields each window
        # of `size` consecutive words.
        windows = zip(*(words[offset:] for offset in range(size)))
        out.extend('_'.join(window) for window in windows)

    return out

In [35]:
# One row per document: the list of n-grams produced by to_ngram.
ngram_df = df['listed_items'].apply(to_ngram).to_frame()
ngram_df.head()

Unnamed: 0,listed_items
0,"[write_build, build_product, product_according..."
1,"[closely_product, product_assist, assist_inves..."
2,"[proficiency_gi, gi_software, software_e, e_g,..."
3,"[effectively_interpret, interpret_client, clie..."
4,"[maintain_existing, existing_software, softwar..."


### Find matching ngrams

The matching in this function must be an exact comparison:  
`doc_ngram == top_ngram` for any n-gram in the list of top n-grams.

A substring test with `in` would also match any bigram that is contained within a trigram, effectively nullifying the result of the *Remove redundant bigrams, trigrams, and quadgrams and combine to create a single list* step.

In [36]:
def check_matching_ngrams(doc_ngrams, candidates=None):
    """Return the n-grams of a document that are among the selected top n-grams.

    Matching is by exact equality, so a bigram that merely appears inside a
    longer selected n-gram is not reported.

    Args:
        doc_ngrams: iterable of n-gram strings for one document.
        candidates: the n-grams to keep, either as a whitespace-separated
            string or as an iterable of strings. When omitted, the
            notebook-level ``top_ngrams`` string is used, as in the original
            version of this function.

    Returns:
        The matching n-grams joined by single spaces, in document order
        (repeated occurrences in the document are kept); ``''`` if none match.
    """
    if candidates is None:
        # Resolved at call time so the function still follows the global.
        candidates = top_ngrams
    if isinstance(candidates, str):
        candidates = candidates.split()

    # Set membership replaces the inner loop over every top n-gram and emits
    # each document n-gram once even if the candidate list repeats an entry.
    wanted = set(candidates)

    return ' '.join(ngram for ngram in doc_ngrams if ngram in wanted)

In [37]:
# For every document keep only the n-grams accepted by check_matching_ngrams,
# stored as one space-separated string per row.
matching_ngram_df = ngram_df['listed_items'].apply(check_matching_ngrams).to_frame()
matching_ngram_df.head()

Unnamed: 0,listed_items
0,product_management product_manager software_de...
1,experience_working experience_using least_one ...
2,experience_working experience_using attention_...
3,high_quality
4,design_develop software_development developmen...


### Combine ngrams with document text

In [38]:
# Append each document's matched n-grams to its text. The explicit space
# keeps the document's last word from fusing with the first n-gram; rstrip
# removes the trailing space left when a document has no matches.
# NOTE: this overwrites `listed_items`, so re-running the cell appends the
# n-grams a second time.
df['listed_items'] = (
    df.listed_items + ' ' + matching_ngram_df.listed_items
).str.rstrip()

In [40]:
# Persist the DataFrame, whose `listed_items` column now carries the appended
# n-grams, to a separate pickle file.
df.to_pickle('../Data/01_clean_sf_custom_ngram')